# Provide some reference for different approaches to $a>2$

Deciding how to define alignment in this setting is not straightforward.  This notebook computes several candidate measures so that you can get a sense of how each one behaves.

Run the first cell below to define the necessary functions.  The second cell shows how to use these functions to process your data.

In [3]:
import pandas
import numpy as np
from scipy.spatial.distance import cdist
from scipy.stats import linregress
from scipy.special import gamma
from itertools import permutations, combinations
import matplotlib.pyplot as plt
%matplotlib inline

# MINIMUM_DISTANCE_BASE is a quantity that helps make sure the difference between two vectors is never actually 0
# This is a reasonable assumption under the belief that observations are made in the presence of noise
# If we actually knew the distribution of the noise we could set this in an informed fashion
# Without that, we can simply set this to a "small" number and then play around with it
# It is the default for the minimum_distance_base argument of the metric functions below, which rescale it
# (see _determine_auxiliary_terms) and use the result as a lower bound on every pairwise distance
MINIMUM_DISTANCE_BASE = .001

# Set TEAM_SIZE_SCALING = True to allow for the distance between larger teams to be scaled down
# When True, distances are multiplied by 1 / sqrt(number of rows in a team's attribute matrix)
# It is the default for the team_size_scaling argument of the metric functions below
TEAM_SIZE_SCALING = True

def extract_columns(columns, all_data, team_id_name='OverallGroupID', outcome_column_name=None):
    """Group the rows of a data table by team and collect the requested attribute columns.

    Parameters
    ----------
    columns : iterable of str
        Names of the attribute columns to collect for every team.
    all_data : pandas.DataFrame
        Table with one row per observation.  It must contain `team_id_name`,
        every entry of `columns` and, if given, `outcome_column_name`.
    team_id_name : str
        Column holding the team identifier.  Each value is converted with int().
    outcome_column_name : str or None
        If given, the value of this column is stored for each team under this
        same name.  When a team has several rows, the value from its last row wins.

    Returns
    -------
    dict
        Maps each integer team id to a dict with
        * 'attribute_dict': {column name: list of that team's values, in row order}
        * 'attributes': 2-D array with one row per table row of the team and one
          column per entry of `columns` (in the order given)
        * outcome_column_name: the outcome value (only if requested)

    Raises
    ------
    IndexError
        If `team_id_name` or `outcome_column_name` is not a column of `all_data`.
    KeyError
        If an entry of `columns` is not a column of `all_data`.
    """
    # Allow any iterable (it is traversed several times below)
    columns = tuple(columns)
    data_matrix = all_data.values

    # Look the columns up in the table that was passed in, not in a notebook global
    team_id_index = np.where(all_data.columns == team_id_name)[0][0]
    outcome_index = None
    if outcome_column_name:
        outcome_index = np.where(all_data.columns == outcome_column_name)[0][0]

    # Pull each attribute column out once as an array so that rows are addressed by
    # position; label-based lookup would fail for a table without a default 0..n-1 index
    column_values = {column: all_data[column].values for column in columns}

    results = {}
    for row_num, row in enumerate(data_matrix):
        team = int(row[team_id_index])
        if team not in results:
            results[team] = {'attribute_dict': {column: [] for column in columns}}
        team_entry = results[team]
        if outcome_index is not None:
            team_entry[outcome_column_name] = row[outcome_index]
        for column in columns:
            team_entry['attribute_dict'][column].append(column_values[column][row_num])

    # Build each team's matrix once, after all of its rows have been collected
    for team_entry in results.values():
        team_entry['attributes'] = np.array([team_entry['attribute_dict'][column] for column in columns]).T

    return results

def _determine_auxiliary_terms(attributes, minimum_distance_base, team_size_scaling):
    """Return the distance floor and the team-size scaling factor for one attribute matrix.

    The number of rows of `attributes` (len(attributes)) is called n below.

    * The scaling factor is 1 / sqrt(n) when `team_size_scaling` is truthy and 1.0 otherwise.
    * The distance floor is minimum_distance_base * gamma((n + 1) / 2) / gamma(n / 2),
      multiplied by the scaling factor.

    Returns a tuple (minimum_distance, team_size_scaling_value).
    """
    row_count = len(attributes)

    if team_size_scaling:
        scaling = 1 / np.sqrt(row_count)
    else:
        scaling = 1.0

    # Same left-to-right evaluation order as the one-line formula in the docstring
    numerator = minimum_distance_base * gamma((row_count + 1) / 2)
    floor = numerator / gamma(row_count / 2) * scaling
    return floor, scaling

def compute_potential_energy(attributes, minimum_distance_base=MINIMUM_DISTANCE_BASE, team_size_scaling=TEAM_SIZE_SCALING):
    """Return the average inverse distance between attribute columns, scaled by the distance floor.

    Distances are taken between every pair of columns of `attributes`, multiplied by the
    team-size scaling factor and bounded below by the distance floor (both come from
    _determine_auxiliary_terms).  The result is floor * mean(1 / distance) over those pairs.
    """
    floor, scale = _determine_auxiliary_terms(attributes, minimum_distance_base, team_size_scaling)

    # Transpose so that cdist compares columns (attributes) rather than rows
    by_attribute = attributes.T
    clamped = np.fmax(cdist(by_attribute, by_attribute) * scale, floor)

    # Keep each unordered pair once: the nonzero entries strictly above the diagonal
    upper_triangle = np.triu(clamped, k=1)
    pair_distances = clamped[np.nonzero(upper_triangle)]

    inverse_total = np.sum(1 / pair_distances)
    return floor / len(pair_distances) * inverse_total

def compute_svd_alignment(attributes, minimum_distance_base=MINIMUM_DISTANCE_BASE, team_size_scaling=TEAM_SIZE_SCALING, num_draws=50, max_draws=10000):
    """Estimate an SVD-based alignment score, averaged over randomly perturbed copies of the data.

    Each draw adds a random shift to `attributes`, clips the result at 0 and is accepted
    only if every pair of columns is then more than the distance floor apart.  For an
    accepted draw the score is (largest singular value / second largest singular value)
    multiplied by a scaling term based on the Frobenius norm.  Drawing stops once
    `num_draws` draws have been accepted.

    Parameters
    ----------
    attributes : 2-D array
        Attribute matrix; distances are measured between its columns.
    minimum_distance_base, team_size_scaling
        Passed to _determine_auxiliary_terms to obtain the distance floor.
    num_draws : int
        Number of accepted draws to average over.
    max_draws : int
        Maximum number of draws attempted (accepted or not).

    Returns
    -------
    float
        Mean score over the accepted draws.  The result is random (uses np.random).

    Raises
    ------
    ValueError
        If `max_draws` draws are exhausted before `num_draws` have been accepted.
    """
    minimum_distance, _ = _determine_auxiliary_terms(attributes, minimum_distance_base, team_size_scaling)

    # The team size is already accounted for in the Frobenius norm
    # NOTE(review): the original comment here was cut off ("we multiply back in the ...").
    # As written, the score is multiplied by len(x) when team_size_scaling is True and by 1.0
    # otherwise -- confirm that this is the intended direction.
    def _svd_alignment(x):
        singular_values = np.linalg.svd(x, compute_uv=False)
        team_size_scaling_value = (1.0 if not team_size_scaling else len(x)) / np.linalg.norm(x, ord='fro')
        return singular_values[0] / singular_values[1] * team_size_scaling_value

    # Positions of every unordered pair of columns in the distance matrix.  Selecting by
    # position (instead of by "nonzero entry") keeps pairs whose distance is exactly 0, so
    # that identical columns are rejected by the minimum-distance test below.
    pair_positions = np.triu_indices(attributes.shape[1], k=1)

    # Technically, the restriction to all positive values isn't necessary, but I think it makes sense
    # The random_shift goes up to 2 * minimum_distance to allow a decent potential of convergence even for high alignment
    vals = []
    for _draw in range(max_draws):
        # Random direction per column (normalised to unit length), one random magnitude per draw
        random_shift = np.random.uniform(-1, 1, size=attributes.shape)
        random_shift = np.random.uniform(0, 2 * minimum_distance) * random_shift / np.sqrt(np.sum(random_shift ** 2, axis=0)[None, :])
        shifted_attributes = np.fmax(attributes + random_shift, 0)
        distance_matrix = cdist(shifted_attributes.T, shifted_attributes.T)
        distances = distance_matrix[pair_positions]
        if not np.all(distances > minimum_distance):
            continue

        vals.append(_svd_alignment(shifted_attributes))
        if len(vals) >= num_draws:
            break
    else:
        raise ValueError('SVD alignment failed to converge')

    return np.mean(vals)

def compute_average_distance(attributes, minimum_distance_base=MINIMUM_DISTANCE_BASE, team_size_scaling=TEAM_SIZE_SCALING):
    """Return the mean distance between pairs of attribute columns.

    Distances are taken between every pair of columns of `attributes`, multiplied by the
    team-size scaling factor and bounded below by the distance floor (both come from
    _determine_auxiliary_terms) before being averaged.
    """
    floor, scale = _determine_auxiliary_terms(attributes, minimum_distance_base, team_size_scaling)

    # Transpose so that cdist compares columns (attributes) rather than rows
    by_attribute = attributes.T
    clamped = np.fmax(cdist(by_attribute, by_attribute) * scale, floor)

    # Keep each unordered pair once: the nonzero entries strictly above the diagonal
    upper_triangle = np.triu(clamped, k=1)
    pair_distances = clamped[np.nonzero(upper_triangle)]

    total = np.sum(pair_distances)
    return 1 / len(pair_distances) * total

def compute_pairwise_distances(attributes, minimum_distance_base=MINIMUM_DISTANCE_BASE, team_size_scaling=TEAM_SIZE_SCALING):
    """Return the distance between every pair of attribute columns as a 1-D array.

    Pairs are ordered as itertools.combinations orders column indices:
    (0, 1), (0, 2), ..., (1, 2), ...  Each Euclidean distance is multiplied by the
    team-size scaling factor and bounded below by the distance floor (both come from
    _determine_auxiliary_terms).
    """
    floor, scale = _determine_auxiliary_terms(attributes, minimum_distance_base, team_size_scaling)
    column_count = attributes.shape[1]
    raw_distances = [
        np.linalg.norm(attributes[:, left] - attributes[:, right])
        for left, right in combinations(range(column_count), 2)
    ]
    scaled = np.array(raw_distances) * scale
    return np.fmax(scaled, floor)

def add_metrics(info, minimum_distance_base=MINIMUM_DISTANCE_BASE, team_size_scaling=TEAM_SIZE_SCALING):
    """Compute every alignment metric for every team and store it on the team's record.

    `info` maps team ids to records that contain an 'attributes' matrix.  Each record
    gains a 'metrics' dict with the keys 'energy', 'avg_dist', 'svd' and 'pairwise'.
    The records are modified in place; nothing is returned.
    """
    shared_settings = (minimum_distance_base, team_size_scaling)
    for team_record in info.values():
        team_attributes = team_record['attributes']
        metrics = {}
        metrics['energy'] = compute_potential_energy(team_attributes, *shared_settings)
        metrics['avg_dist'] = compute_average_distance(team_attributes, *shared_settings)
        metrics['svd'] = compute_svd_alignment(team_attributes, *shared_settings)
        metrics['pairwise'] = compute_pairwise_distances(team_attributes, *shared_settings)
        team_record['metrics'] = metrics
        
def print_metrics(info, team_id_name='OverallGroupID', outcome_column_name='outcome', savefile=None, csv_sep='\t', suppress_output=False):
    """Print and/or save one delimited line per team with its outcome and metrics.

    Parameters
    ----------
    info : dict
        Maps team ids to records containing 'attribute_dict', 'metrics' and an
        entry named `outcome_column_name`.
    team_id_name : str
        Header text for the team id column.
    outcome_column_name : str
        Key of the outcome value in each record; also used as its header text.
    savefile : str or None
        If given, all lines (header first) are written to this path.
    csv_sep : str
        Separator placed between fields (a tab by default).
    suppress_output : bool
        If True, nothing is printed to the screen.

    The header and the metric columns are taken from the first team in `info`:
    the scalar metrics in sorted name order, followed by one 'pairwise--A--B'
    column per pair of attributes when a 'pairwise' metric is present.

    Raises
    ------
    KeyError
        If a record lacks `outcome_column_name` or one of the metrics named in the header.
    """
    lines = []

    def emit(line):
        # Print as each line is produced, and remember it for the optional file
        if not suppress_output:
            print(line)
        lines.append(line)

    metric_names = None
    for team_id, stuff in info.items():
        metrics = stuff['metrics']
        if metric_names is None:
            attribute_names = list(stuff['attribute_dict'])
            metric_names = sorted(metric for metric in metrics if metric != 'pairwise')
            header = [team_id_name, outcome_column_name] + metric_names
            if 'pairwise' in metrics:
                header += [
                    '--'.join(['pairwise', attribute_1, attribute_2])
                    for attribute_1, attribute_2 in combinations(attribute_names, 2)
                ]
            emit(csv_sep.join(header))

        row = [team_id, stuff[outcome_column_name]] + [metrics[metric] for metric in metric_names]
        # The header treats 'pairwise' as optional, so the rows must as well
        if 'pairwise' in metrics:
            row += metrics['pairwise'].tolist()
        emit(csv_sep.join(str(value) for value in row))

    if savefile:
        with open(savefile, 'w') as f:
            f.writelines(line + '\n' for line in lines)

## These lines are the lines that are used to compute the alignment values and print/save them

* Fill in your file names (right now `'my-data-file.csv'` and `'where-i-store-alignment-values.csv'`)
* Set `outcome_column_name` to the name of the column that contains the outcome
* Replace `('attribute_0', 'attribute_1', 'attribute_2', 'attribute_3')` with whatever attributes you want to study. 
* Set suppress_output equal to False if you want the data to appear on this screen. 
* Then, copy/paste the data into Microsoft Excel and separate into columns where tabs appear (the output is tab-separated by default)
     - Data => Text to Columns => Delimited => Check "Tab" => Finish

This block will not run as it is right now because these files do not exist

In [4]:
# Name of the column in the data file that holds each team's outcome value
outcome_column_name = 'DV'
# Attribute columns to collect for every team; the metrics compare these columns with one another
attributes_to_study = ('attribute_0', 'attribute_1', 'attribute_2', 'attribute_3')

# Load the raw table; index_col=False tells pandas not to use the first column as the row index
imported_data = pandas.read_csv('my-data-file.csv', index_col=False, sep=',')
# Group the rows by team (team id column defaults to 'OverallGroupID') and gather the attribute values
results = extract_columns(attributes_to_study, imported_data, outcome_column_name=outcome_column_name)
# Attach the 'energy', 'avg_dist', 'svd' and 'pairwise' metrics to every team record in place
add_metrics(results)
# suppress_output=False prints the table to the screen; set it to True to only write the file
print_metrics(results, savefile='where-i-store-alignment-values.csv', suppress_output=False, outcome_column_name=outcome_column_name)

OverallGroupID	CLU4_1	avg_dist	energy	svd	pairwise--Challenger--Doer	pairwise--Challenger--Innovator	pairwise--Challenger--Organizer	pairwise--Challenger--TeamBuilder	pairwise--Doer--Innovator	pairwise--Doer--Organizer	pairwise--Doer--TeamBuilder	pairwise--Innovator--Organizer	pairwise--Innovator--TeamBuilder	pairwise--Organizer--TeamBuilder
104	1	0.45848015403678777	0.0018311550310138372	3.83371359922117	0.43033148291193835	0.3191423692521122	0.48112522432468957	0.21516574145596687	0.5692750425533134	0.5181877251716015	0.48112522432468957	0.7817359599705718	0.13608276348795315	0.6526300069150409
105	1	0.6468626703299208	0.0012654713991637058	4.1230091737788	0.44095855184409954	0.7312470322826751	0.8291561975888491	0.2041241452319297	0.7312470322826762	0.6614378277661488	0.39086797998528516	1.1365151414154857	0.6508541396588864	0.6922186552431724
202	1	0.4767284344043117	0.001547199039330228	4.477388105668258	0.5055250296034364	0.38005847503304885	0.8333333333333329	0.35746017649212086