In [1]:
import pandas as pd
from collections import Counter
import math

In [2]:
teaming = pd.read_csv('../results/20170730-221813/teaming2.out', header=None, names=['hash','team','semester'])
data = pd.read_csv('../project4.csv')

In [3]:
teaming.head()

Unnamed: 0,hash,team,semester
0,8896b6354cd35afd2e5ae3d690979490,0,ST-16
1,91aa98d72504dc8abbc98b6e5d376927,0,ST-16
2,729d43e103d18246e36e3b37f7080c45,0,ST-16
3,362be6f258a90bd5939961dab97a5414,0,ST-16
4,8714daecb7866f07220d7e3f5cd80be5,0,ST-16


In [4]:
teams = pd.merge(teaming, data, on='hash')[['team', 'Sex', 'Discipline', 'Nationality', 'Semester']]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(teams)

     team  Sex            Discipline                 Nationality Semester
0       0    f              Business                      German    ST-16
1       0    f           Engineering                      German    ST-16
2       0    m           Engineering                      German    ST-16
3       0    m         Life Sciences                     Russian    ST-16
4       0    f  Creative Disciplines                      German    ST-16
5       1    f            Humanities                      German    ST-16
6       1    m           Engineering                     Chilean    ST-16
7       1    m              Business                      German    ST-16
8       1    f              Business                      German    ST-16
9       1    m       Social Sciences                      German    ST-16
10      2    f           Engineering                      German    ST-16
11      2    f              Business                     Mexican    ST-16
12      2    f              Business  

In [5]:
len(teams[teams['Nationality'] != 'German'])

79

In [6]:
teams['Discipline'].value_counts()

Engineering             88
Business                85
Social Sciences         43
Creative Disciplines    36
Life Sciences           30
Humanities              24
Media                   17
Name: Discipline, dtype: int64

In [7]:
teams['Nationality'].value_counts()

German                        244
Chilean                         7
Italian                         6
American or US                  5
Dutch or Netherlandish          5
Russian                         4
Danish                          4
Indian                          3
Argentine                       3
Japanese                        3
Chinese                         3
Austrian                        3
Mexican                         3
Vietnamese                      2
Brazilian                       2
Spanish                         2
French                          2
Egyptian                        2
Polish                          2
Hungarian                       1
Swiss                           1
Sri Lankan                      1
Nepalese                        1
Slovenian (or Slovene)          1
Cuban                           1
British and Northern Irish      1
Macedonian                      1
Ecuadorean or Ecuadorian        1
Lithuanian                      1
Canadian      

In [8]:
teams['Sex'].value_counts()

f    162
m    157
F      3
Name: Sex, dtype: int64

In [48]:
def single_entropy(team):
    counter = Counter(team)
    l = len(team)
    return -sum((c/l) * math.log(c/l) for c in counter.values())
        
def entropy(teams):
    return sum(single_entropy(team) for team in teams)

def max_entropy(value_map):
    n_teams = 16
    teams = []
    for _ in range(n_teams):
        teams.append([])
    i = 0
    for attr_id, value_count in enumerate(value_map):
        base_n = value_count // n_teams
        for team in teams:
            team.extend([attr_id] * base_n)
        value_count = value_count % n_teams
        while value_count > 0:
            teams[i].append(attr_id)
            value_count -= 1
            i = (i + 1) % n_teams
    return entropy(teams)


def min_entropy(value_map):
    n_teams = 16
    teams = []
    for _ in range(n_teams):
        teams.append([])
    offset = 0
    for attr_id, value_count in enumerate(value_map):
        for i in range(value_count):
            teams[((i + offset) // 5) % 16].append(attr_id)
        offset += value_count
    return entropy(teams)
    

def max_entropy_teaming(teaming):
    attrs = ['Sex', 'Nationality', 'Discipline']
    return sum(max_entropy(teaming[attr].value_counts()) for attr in attrs)

def min_entropy_teaming(teaming):
    attrs = ['Sex', 'Nationality', 'Discipline']
    return sum(min_entropy(teaming[attr].value_counts()) for attr in attrs)

def print_entropies(teaming):
    attrs = ['Sex', 'Nationality', 'Discipline']
    for attr in attrs:
        print(attr)
        print('no teaming: {:0.4f}'.format(single_entropy(teams[attr].values)))
        print('teaming: {:0.4f}'.format(max_entropy(teaming[attr].value_counts())))
        print('')
    print('total teaming: {:0.4f}'.format(max_entropy_teaming(teaming)))


In [49]:
for semester in teaming["semester"].unique():
    current_team = teams[teams['Semester'] == semester]
    print(semester, min_entropy_teaming(current_team))

ST-16 9.49094037803773
WT-16 10.407862037073782
ST-17 9.317638041395403
WT-15 10.268601673799896


In [22]:
max_entropy(teams['Nationality'].value_counts())

15.150218430826644

In [27]:
for semester in teaming["semester"].unique():
    current_team = teams[teams['Semester'] == semester]
    print(semester, max_entropy_teaming(current_team))

ST-16 42.650438082902824
WT-16 45.07472937880132
ST-17 42.19514805803991
WT-15 43.99573423209929


In [11]:
max_entropy_teaming(teams)

55.098895307025195

In [12]:
single_entropy(teams['Nationality'].values)

1.3784000129863025

In [13]:
print_entropies(teams)

Sex
no teaming: 0.7581
teaming: 11.5602

Nationality
no teaming: 1.3784
teaming: 15.1502

Discipline
no teaming: 1.7874
teaming: 28.3884

total teaming: 55.0989
