In [103]:
import json
import re
from fuzzywuzzy import fuzz
import numpy as np
import copy
from scipy.stats import binom

In [2]:
with open("images_metadata.json") as f:
    md = json.load(f)

In [53]:
def create_count_map(md, sub_map=None):
    sub_map = sub_map or {}
    person_to_player_to_count = {}
    player_to_count = {}
    for grid in md:
        person = grid["submitter"]
        if person not in person_to_player_to_count:
            person_to_player_to_count[person] = {}

        for player in grid["responses"].values():
            player = re.sub("\s+", " ", sub_map.get(player, player)).strip()
            if not player:
                continue
            if player not in person_to_player_to_count[person]:
                person_to_player_to_count[person][player] = 0
            if player not in player_to_count:
                player_to_count[player] = 0
            person_to_player_to_count[person][player] += 1
            player_to_count[player] += 1
    return person_to_player_to_count, player_to_count

In [54]:
def create_sub_map(player_to_count):
    players = sorted([p for p in player_to_count], key=lambda x: player_to_count[x], reverse=True)
    
    sub_map = {}
    threshold = 80
    for i, p1 in enumerate(players):
        for p2 in players[i + 1:]:
            ratio = fuzz.ratio(p1, p2)
            if (ratio > threshold or (p2 in p1 and "blue" not in p1.lower())) and p2 not in sub_map:
                target = p1
                while target in sub_map:
                    target = sub_map[target]
                sub_map[p2] = target
    return sub_map

In [129]:
def create_person_to_total(person_to_player_to_count):
    person_to_total = {}
    for person in person_to_player_to_count:
        person_to_total[person] = sum([c for c in person_to_player_to_count[person].values()])
    return person_to_total

def get_entropy(person_to_player_to_count, person_to_total):
    person_to_entropy = {}
    for person in person_to_player_to_count:
        person_to_entropy[person] = 0
        total = person_to_total[person]
        for player, count in person_to_player_to_count[person].items():
            p = count / total
            person_to_entropy[person] += -(p * np.log2(p))
    return person_to_entropy

def laplace_smooth(person_to_player_to_count, person_to_total, player_to_count):
    person_to_player_to_count_copy = copy.deepcopy(person_to_player_to_count)
    person_to_total_copy = copy.deepcopy(person_to_total)
    for person in person_to_player_to_count_copy:
        for player in player_to_count:
            if player not in person_to_player_to_count_copy[person]:
                person_to_player_to_count_copy[person][player] = 0
            person_to_player_to_count_copy[person][player] += 1
            person_to_total_copy[person] += 1
    return person_to_player_to_count_copy, person_to_total_copy
    
def calculate_cross_entropy(person_to_player_to_count, person_to_total, player_to_count):
    def calc_ce_inner(pa, pb):
        ce = 0
        a_total = person_to_total_copy[pa]
        b_total = person_to_total_copy[pb]
        for player in player_to_count:
            a_freq = person_to_player_to_count_copy[pa][player] / a_total
            b_freq = person_to_player_to_count_copy[pb][player] / b_total
            ce -= (a_freq * np.log2(b_freq))
        return ce
            
    # laplace smoothing so we don't take log(0)
    person_to_player_to_count_copy, person_to_total_copy = laplace_smooth(person_to_player_to_count, person_to_total)

    pair_to_ce = {}
    people = [p for p in person_to_player_to_count_copy]
    for i, pa in enumerate(people):
        for pb in people[i + 1:]:
            ce = (calc_ce_inner(pa, pb) + calc_ce_inner(pa, pb)) / 2
            pair_to_ce[f"{pa} - {pb}"] = ce
    return pair_to_ce

def get_favorite_players(person_to_player_to_count, person_to_total, player_to_count):
    person_to_player_to_count_copy, person_to_total_copy = laplace_smooth(person_to_player_to_count, person_to_total, player_to_count)

    person_to_player_to_favorite = {}
    for person in person_to_total_copy:
        person_to_player_to_favorite[person] = {}
        
        for player in player_to_count:

            p = 0
            for other in filter(lambda x: x != person, person_to_total_copy):
                p += person_to_player_to_count_copy[other][player] / person_to_total_copy[person]
            p /= (len(person_to_total_copy) - 1)

            K = person_to_player_to_count_copy[person][player]
            N = person_to_total_copy[person]
            prob = 1 - (binom.cdf(K, N, p) - binom.pmf(K, N, p))
            person_to_player_to_favorite[person][player] = prob
    return person_to_player_to_favorite 

def get_player_entropy(person_to_player_to_count, person_to_total, player_to_count):
    player_entropy = {}
    for player in player_to_count:
        player_dist = []
        for person in person_to_player_to_count:
            if player in person_to_player_to_count[person]:
                player_dist.append(person_to_player_to_count[person][player] / person_to_total[person])
        player_dist_norm = [x / sum(player_dist) for x in player_dist]
        player_entropy[player] = -sum([p * np.log2(p) for p in player_dist_norm])
    return player_entropy
        

In [55]:
person_to_player_to_count, player_to_count = create_count_map(md)
sub_map = create_sub_map(player_to_count)
person_to_player_to_count, player_to_count = create_count_map(md, sub_map)

In [130]:
person_to_total = create_person_to_total(person_to_player_to_count)
person_to_entropy = get_entropy(person_to_player_to_count, person_to_total)
person_to_player_to_favorite = get_favorite_players(person_to_player_to_count, person_to_total, player_to_count)
player_entropy = get_player_entropy(person_to_player_to_count, person_to_total, player_to_count)

In [71]:
print("Most Concentrated to Least")
for i, (person, entropy) in enumerate(sorted(person_to_entropy.items(), key=lambda x: x[1], reverse=False)):
    print(f"{i + 1}. {person} ({round(float(person_to_entropy[person]), 2)})")

Most Concentrated to Least
1. Rachel (7.73)
2. Keith (8.08)
3. Will (8.11)
4. Sam (8.19)
5. Cliff (8.48)


In [139]:
print("Least Concentrated to Most (Players)")
for i, (player, entropy) in enumerate(sorted(player_entropy.items(), key=lambda x: x[1], reverse=True)):
    print(f"{i + 1}. {player} ({round(float(entropy), 2)})")
    if i == 14:
        break

Least Concentrated to Most (Players)
1. George Brett (2.31)
2. Randy Johnson (2.28)
3. Jim Palmer (2.28)
4. Walter Johnson (2.27)
5. Willie Mays (2.26)
6. Evan Longoria (2.25)
7. Tom Seaver (2.24)
8. Albert Pujols (2.24)
9. Ty Cobb (2.24)
10. Fernando Tatis Jr (2.24)
11. Alex Rodriguez (2.23)
12. Fergie Jenkins (2.23)
13. Nolan Ryan (2.23)
14. Pete Rose (2.22)
15. Roger Clemens (2.2)


In [102]:
print("Most Similar to Least")
ce_pair = calculate_cross_entropy(person_to_player_to_count, person_to_total, player_to_count)
for i, (pair, entropy) in enumerate(sorted(ce_pair.items(), key=lambda x: x[1], reverse=False)):
    print(f"{i + 1}. {pair} ({round(float(ce_pair[pair]), 2)})")

Most Similar to Least
1. Rachel - Sam (9.81)
2. Rachel - Keith (10.09)
3. Will - Sam (10.11)
4. Will - Rachel (10.23)
5. Keith - Sam (10.35)
6. Will - Keith (10.43)
7. Keith - Cliff (10.49)
8. Rachel - Cliff (10.68)
9. Will - Cliff (10.7)
10. Sam - Cliff (10.91)


In [126]:
print("Favorites\n\n")
for person in person_to_player_to_favorite:
    print(person)

    og_sort = sorted(person_to_player_to_favorite[person].items(), key=lambda x: person_to_player_to_count[person].get(x[0], 0), reverse=True)
    for i, (player, fav) in enumerate(sorted(og_sort, key=lambda x: x[1], reverse=False)):
        print(f"{i + 1}. {player} ({round(float(fav), 10)}, {person_to_player_to_count[person][player]})")
        if i == 14:
            break
    print()

Favorites


Will
1. Phil Niekro (0.0, 21)
2. Moisés Alou (0.0, 30)
3. Derrek Lee (0.0, 27)
4. Bill Dahlen (0.0, 14)
5. Mark Grudzielanek (1e-09, 21)
6. Cool Papa Bell (2.1e-08, 16)
7. Aramis Ramirez (4.02e-08, 21)
8. Jeromy Burnitz (6.71e-08, 11)
9. Quinton McCracker (6.71e-08, 11)
10. Grover Lowdermilk (1.185e-07, 13)
11. Boom-Boom Beck (1.1148e-06, 8)
12. Robin Roberts (1.1148e-06, 8)
13. Jimmy Bloodworth (1.1148e-06, 8)
14. Sad Sam Jones (3.1845e-06, 24)
15. Kyle Tucker (4.0542e-06, 9)

Rachel
1. Tommy Pham (0.0, 20)
2. Alfonso Soriano (9e-10, 28)
3. Tyler Clippard (2.87e-08, 13)
4. Roberto Clemente (3.36e-08, 20)
5. Jake Arrieta (7.79e-08, 20)
6. Jacoby Ellsbury (1.2425e-06, 13)
7. Jose Altuve (4.1793e-06, 14)
8. Craig Counsell (1.01783e-05, 7)
9. Sandy Koufax (1.98765e-05, 15)
10. Happ (4.60659e-05, 9)
11. Tom Glavine (6.09733e-05, 10)
12. Bryce Harper (0.0001269774, 18)
13. Jayson Werth (0.0001390509, 10)
14. Jackie Robinson (0.0001409694, 22)
15. Lou Gehrig (0.0001464986, 12)

K

In [74]:
for i, (player, count) in enumerate(sorted(player_to_count.items(), key=lambda x: x[1], reverse=True)):
    print(i + 1, player, count)
    if i > 40:
        break

1 Willie Mays 126
2 Carlos Beltran 124
3 Rogers Hornsby 113
4 Jimmie Foxx 91
5 Alex Rodriguez 86
6 Tom Seaver 86
7 Heinie Manush 80
8 Max Scherzer 78
9 Zack Greinke 77
10 Frank Robinson 75
11 Barry Bonds 75
12 Randy Johnson 68
13 Roger Clemens 66
14 Mike Piazza 66
15 Vladimir Guerrero 64
16 Walter Johnson 64
17 Fergie Jenkins 63
18 Aroldis Chapman 62
19 Albert Pujols 61
20 Nolan Ryan 60
21 Kris Bryant 60
22 Steve Carlton 60
23 Cap Anson 59
24 Greg Maddux 57
25 Jackie Robinson 56
26 Ichiro Suzuki 56
27 Jim Edmonds 55
28 Sad Sam Jones 54
29 Paul Molitor 54
30 Cody Bellinger 54
31 Bert Blyleven 53
32 Alfonso Soriano 53
33 Ken Griffey Jr 53
34 Babe Ruth 53
35 José Bautista 53
36 Johnny Damon 52
37 Jeff Francoeur 51
38 Ty Cobb 51
39 Henry Aaron 50
40 David Wells 50
41 Jim Thome 49
42 Sammy Sosa 49


In [148]:
person = "Rachel"
for i, (player, count) in enumerate(sorted(person_to_player_to_count[person].items(), key=lambda x: x[1], reverse=True)):
    print(i + 1, player, f"{round(100 * count / person_to_total[person], 2)}%")
    if i == 24:
        break

1 Carlos Beltran 1.51%
2 Rogers Hornsby 1.46%
3 Alfonso Soriano 1.46%
4 Willie Mays 1.3%
5 Barry Bonds 1.3%
6 Vladimir Guerrero 1.2%
7 Jackie Robinson 1.15%
8 Alex Rodriguez 1.1%
9 Aroldis Chapman 1.1%
10 Roger Clemens 1.1%
11 Kris Bryant 1.1%
12 Max Scherzer 1.04%
13 Jake Arrieta 1.04%
14 Roberto Clemente 1.04%
15 Babe Ruth 1.04%
16 Henry Aaron 1.04%
17 Greg Maddux 1.04%
18 Tommy Pham 1.04%
19 Zack Greinke 0.99%
20 Nolan Ryan 0.94%
21 Bryce Harper 0.94%
22 Mike Piazza 0.94%
23 Derek Jeter 0.89%
24 Ivan Rodriguez 0.89%
25 Paul Molitor 0.89%


In [149]:
person = "Keith"
for i, (player, count) in enumerate(sorted(person_to_player_to_count[person].items(), key=lambda x: x[1], reverse=True)):
    print(i + 1, player, f"{round(100 * count / person_to_total[person], 2)}%")
    if i == 24:
        break

1 Carlos Beltran 2.32%
2 Heinie Manush 1.88%
3 Jimmie Foxx 1.75%
4 Rogers Hornsby 1.68%
5 David Wells 1.54%
6 Jeff Francoeur 1.44%
7 Willie Mays 1.41%
8 Endy Chavez 1.28%
9 Dan Haren 1.24%
10 Waite Hoyt 1.21%
11 Carlos Pena 1.14%
12 Oliver Pérez 1.11%
13 David Wright 1.07%
14 Todd Zeile 1.07%
15 Mark Koenig 1.01%
16 Mike Cameron 1.01%
17 Gary Sheffield 0.97%
18 Mark DeRosa 0.94%
19 Tom Seaver 0.91%
20 Pud Galvin 0.81%
21 Moe Berg 0.81%
22 Frank Robinson 0.77%
23 Bert Blyleven 0.77%
24 Jim Kaat 0.77%
25 Keith Hernandez 0.74%


In [150]:
person = "Will"
for i, (player, count) in enumerate(sorted(person_to_player_to_count[person].items(), key=lambda x: x[1], reverse=True)):
    print(i + 1, player, f"{round(100 * count / person_to_total[person], 2)}%")
    if i == 24:
        break

1 Moisés Alou 1.58%
2 Derrek Lee 1.42%
3 Sad Sam Jones 1.27%
4 Jimmie Foxx 1.27%
5 Aramis Ramirez 1.11%
6 Phil Niekro 1.11%
7 Mark Grudzielanek 1.11%
8 Zack Greinke 1.05%
9 Frank Robinson 1.0%
10 Alex Rodriguez 0.95%
11 Tom Seaver 0.9%
12 Max Scherzer 0.9%
13 Aroldis Chapman 0.9%
14 Barry Bonds 0.84%
15 Cool Papa Bell 0.84%
16 Reggie Jackson 0.84%
17 Jim Edmonds 0.84%
18 Steve Carlton 0.84%
19 Silver King 0.79%
20 Kris Bryant 0.79%
21 Sammy Sosa 0.79%
22 Bill Dahlen 0.74%
23 Joe Morgan 0.74%
24 Javier Baez 0.74%
25 Fergie Jenkins 0.74%


In [151]:
person = "Sam"
for i, (player, count) in enumerate(sorted(person_to_player_to_count[person].items(), key=lambda x: x[1], reverse=True)):
    print(i + 1, player, f"{round(100 * count / person_to_total[person], 2)}%")
    if i == 24:
        break

1 Josh Donaldson 1.58%
2 José Bautista 1.34%
3 Cody Bellinger 1.3%
4 Jim Edmonds 1.25%
5 Max Scherzer 1.2%
6 Zack Greinke 1.2%
7 Kelly Johnson 1.11%
8 Ichiro Suzuki 1.07%
9 Johnny Damon 1.07%
10 Alex Rodriguez 0.93%
11 Kris Bryant 0.93%
12 Zach Duke 0.93%
13 Willie Mays 0.88%
14 Adrian Beltr 0.88%
15 Steve Finley 0.88%
16 César Izturis 0.88%
17 Barry Bonds 0.83%
18 Walter Johnson 0.83%
19 Ben Zobrist 0.79%
20 Aroldis Chapman 0.79%
21 Greg Maddux 0.74%
22 Carlos Beltran 0.74%
23 Albert Pujols 0.74%
24 Kevin Mitchell 0.74%
25 Nelson Cruz 0.7%


In [152]:
person = "Cliff"
for i, (player, count) in enumerate(sorted(person_to_player_to_count[person].items(), key=lambda x: x[1], reverse=True)):
    print(i + 1, player, f"{round(100 * count / person_to_total[person], 2)}%")
    if i == 24:
        break

1 Tommie Agee 1.49%
2 Orlando Cepeda 1.25%
3 Willie Mays 1.2%
4 Rogers Hornsby 1.16%
5 Frank Robinson 1.12%
6 Rusty Staub 0.95%
7 Gaylord Perry 0.95%
8 Ken Griffey Jr 0.95%
9 Heinie Manush 0.95%
10 Tom Seaver 0.91%
11 Rod Carew 0.83%
12 Jerry Koosman 0.79%
13 Fergie Jenkins 0.79%
14 Luis Tiant 0.79%
15 Bert Blyleven 0.75%
16 Joe Morgan 0.75%
17 Al Simmons 0.75%
18 Pedro Martinez 0.71%
19 Willie McCovey 0.71%
20 Randy Johnson 0.71%
21 Dave Kingman 0.71%
22 Tris Speaker 0.66%
23 Cap Anson 0.66%
24 Roger Clemens 0.62%
25 Joe Foy 0.62%
