In [1]:
import pandas as pd
from collections import Counter
import math

In [2]:
# Team assignments: the file is read with header=None, so its first row is treated as data
# and the three columns are named explicitly (participant hash, team id, semester).
# NOTE(review): presumably the output of one teaming run, judging by the path -- confirm.
teaming = pd.read_csv('../results/20170725-125805/teaming1.out', header=None, names=['hash','team','semester'])
# Per-participant attributes (Sex, Discipline, Nationality, Semester), keyed by 'hash'.
data = pd.read_csv('../project4.csv')
data.head()

Unnamed: 0,hash,Sex,Discipline,Nationality,Semester
0,59ec0b057eedda06d9d6b624f8e44c93,f,Business,Sri Lankan,WT-15
1,1d7d77b5a1211746723dce012152b763,m,Life Sciences,German,WT-15
2,82c778ace992789d990201847ec0e493,f,Social Sciences,German,WT-15
3,b6b46b3be7a6db0261ed697030962129,f,Business,French,WT-15
4,a09625f8f1ddd73adeb6db620222c2c6,m,Humanities,German,WT-15


In [3]:
# Preview the team assignments: one row per participant hash with its team id and semester.
teaming.head()

Unnamed: 0,hash,team,semester
0,58f15d2b9b461bc55420caeeeef6d22a,0,ST-16
1,72ba7c279977dfc690e11b75dc1fba5c,0,ST-16
2,d28f3ce8a77fc7a7b840726d3b6fb200,0,ST-16
3,d1709e5f4636121ba142156547c39e57,0,ST-16
4,eace21281285a59c78e17eb3a0211e0c,0,ST-16


In [4]:
# Attach each participant's attributes to their team assignment by joining on 'hash'.
# pd.merge defaults to an inner join, so hashes present in only one frame are dropped.
# 'Semester' comes from `data`; the lowercase 'semester' column of `teaming` is not kept.
teams = pd.merge(teaming, data, on='hash')[['team', 'Sex', 'Discipline', 'Nationality', 'Semester']]
teams.head()

Unnamed: 0,team,Sex,Discipline,Nationality,Semester
0,0,f,Social Sciences,German,ST-16
1,0,f,Creative Disciplines,German,ST-16
2,0,f,Social Sciences,Japanese,ST-16
3,0,m,Social Sciences,German,ST-16
4,0,m,Engineering,Argentine,ST-16


In [5]:
# Number of participants whose Nationality is not exactly the string 'German'
# (compound values such as 'German/Vietnamese' are therefore counted here as well).
len(teams[teams['Nationality'] != 'German'])

79

In [6]:
# Participants per discipline over the whole merged table, most frequent first.
teams['Discipline'].value_counts()

Engineering             88
Business                85
Social Sciences         43
Creative Disciplines    36
Life Sciences           30
Humanities              24
Media                   17
Name: Discipline, dtype: int64

In [7]:
# Participants per nationality over the whole merged table, most frequent first.
teams['Nationality'].value_counts()

German                        244
Chilean                         7
Italian                         6
Dutch or Netherlandish          5
American or US                  5
Danish                          4
Russian                         4
Austrian                        3
Indian                          3
Chinese                         3
Japanese                        3
Argentine                       3
Mexican                         3
French                          2
Brazilian                       2
Egyptian                        2
Vietnamese                      2
Spanish                         2
Polish                          2
Swiss                           1
Luxembourgish                   1
Canadian                        1
Ecuadorean or Ecuadorian        1
German/Vietnamese               1
Nepalese                        1
Kenyan                          1
Moroccan                        1
Slovenian (or Slovene)          1
Lithuanian                      1
British and No

In [8]:
# Participants per sex.
# NOTE(review): the output shows both 'f' and 'F', so they are counted as separate categories
# here and by the entropy functions below -- consider normalising the case before analysing.
teams['Sex'].value_counts()

f    162
m    157
F      3
Name: Sex, dtype: int64

In [9]:
def single_entropy(team):
    """Return the Shannon entropy (natural logarithm) of the values in one team.

    `team` is a sized collection of hashable attribute values. Every distinct
    value contributes -p * ln(p), where p is its share of the team. An empty
    team has no distinct values and therefore yields 0.
    """
    occurrences = Counter(team)
    size = len(team)
    shares = [count / size for count in occurrences.values()]
    return -sum(share * math.log(share) for share in shares)
        
def entropy(teams):
    """Return the total entropy of a teaming: the sum of each team's own entropy.

    `teams` is an iterable of teams, each in the form accepted by single_entropy.
    """
    return sum(map(single_entropy, teams))

def max_entropy(value_map, n_teams=16):
    """Return the summed per-team entropy after spreading values evenly over teams.

    Builds `n_teams` synthetic teams and distributes each attribute value across
    them as evenly as possible, then returns entropy() of the result.

    Parameters
    ----------
    value_map : iterable of int
        One occurrence count per distinct attribute value (for example the
        result of ``Series.value_counts()``). Only the counts are used; each
        value is identified by its position in the iteration.
    n_teams : int, optional
        Number of teams to distribute over. Defaults to 16, the value that was
        previously hard-coded.

    Returns
    -------
    float
        Sum of single_entropy over the constructed teams.

    Raises
    ------
    ValueError
        If `n_teams` is smaller than 1.
    """
    if n_teams < 1:
        raise ValueError('n_teams must be at least 1')

    teams = [[] for _ in range(n_teams)]
    # Index of the team that receives the next left-over member. It is shared
    # across attribute values so the left-overs rotate through the teams
    # instead of always piling onto the first ones.
    next_team = 0
    for attr_id, value_count in enumerate(value_map):
        base_n, leftover = divmod(value_count, n_teams)
        # Every team gets the same guaranteed share of this value ...
        for team in teams:
            team.extend([attr_id] * base_n)
        # ... and the remainder is handed out one at a time, round-robin.
        for _ in range(leftover):
            teams[next_team].append(attr_id)
            next_team = (next_team + 1) % n_teams
    return entropy(teams)
    

def max_entropy_teaming(teaming):
    """Return the evenly-teamed entropy summed over Sex, Nationality and Discipline.

    `teaming` is a DataFrame with those three columns; each column's value
    counts are passed to max_entropy and the three results are added up.
    """
    per_attribute = [
        max_entropy(teaming[column].value_counts())
        for column in ('Sex', 'Nationality', 'Discipline')
    ]
    return sum(per_attribute)

def print_entropies(teaming):
    """Print the ungrouped and evenly-teamed entropy of each attribute in `teaming`.

    For each of Sex, Nationality and Discipline this prints:
      * 'no teaming' -- single_entropy of the whole column, i.e. all rows of
        `teaming` treated as one group;
      * 'teaming'    -- max_entropy of the column's value counts.
    A final line prints max_entropy_teaming(teaming), the sum over all attributes.

    `teaming` is a DataFrame with the columns 'Sex', 'Nationality' and 'Discipline'.
    Nothing is returned; the output goes to stdout.
    """
    attrs = ['Sex', 'Nationality', 'Discipline']
    for attr in attrs:
        print(attr)
        # Read from the frame that was passed in rather than the notebook-global
        # `teams`, so that calling this on a subset reports that subset's entropy.
        print('no teaming: {:0.4f}'.format(single_entropy(teaming[attr].values)))
        print('teaming: {:0.4f}'.format(max_entropy(teaming[attr].value_counts())))
        print('')
    print('total teaming: {:0.4f}'.format(max_entropy_teaming(teaming)))


In [10]:
# Summed per-team entropy of Nationality when its values are spread evenly over the teams.
max_entropy(teams['Nationality'].value_counts())

15.150218430826644

In [11]:
# The same evenly-spread entropy, added up over Sex, Nationality and Discipline.
max_entropy_teaming(teams)

55.098895307025195

In [12]:
# Entropy (natural log) of Nationality with all participants treated as one single group.
single_entropy(teams['Nationality'].values)

1.3784000129863025

In [13]:
# Evenly-teamed total entropy computed separately for each semester, in order of first appearance.
# NOTE(review): max_entropy spreads every subset over 16 teams, so each semester is also
# evaluated with 16 teams -- confirm that this matches the real number of teams per semester.
semesters = teams['Semester'].unique()
for sem in semesters:
    print(max_entropy_teaming(teams[teams['Semester'] == sem]))

42.650438082902824
43.99573423209929
42.19514805803991
45.07472937880132
