In [1]:
#import libraries
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

#import data
matches = pd.read_csv('test_player.csv')
outcomes = pd.read_csv('test_labels.csv')
champions = pd.read_csv('hero_names.csv')

#EDA on Matches table
matches.describe()

Unnamed: 0,match_id,account_id,hero_id,player_slot
count,1000000.0,1000000.0,1000000.0,1000000.0
mean,99999.5,91028.072414,50.960134,66.0
std,28867.527892,103566.804928,33.229504,64.015655
min,50000.0,0.0,0.0,0.0
25%,74999.75,0.0,21.0,2.0
50%,99999.5,39462.5,48.0,66.0
75%,124999.25,177762.0,75.0,130.0
max,149999.0,330514.0,112.0,132.0


In [2]:
#drop the account_id column, it won't be used
matches.drop(['account_id'],axis=1)
matches.head(n=10)

Unnamed: 0,match_id,account_id,hero_id,player_slot
0,50000,117784,96,0
1,50000,158361,84,1
2,50000,158362,46,2
3,50000,137970,85,3
4,50000,1090,39,4
5,50000,2391,9,128
6,50000,2393,75,129
7,50000,2394,106,130
8,50000,36737,74,131
9,50000,2392,62,132


In [3]:
#collapse data into columns so that each entry becomes all the hero IDs on a team
competitor_rows = pd.pivot_table(matches, values='hero_id', index=['match_id'], columns='player_slot')
competitor_rows.head(n=10)

player_slot,0,1,2,3,4,128,129,130,131,132
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
50000,96,84,46,85,39,9,75,106,74,62
50001,44,10,57,2,106,58,61,21,18,14
50002,74,7,42,99,88,69,8,25,26,79
50003,44,15,110,56,94,2,101,32,7,72
50004,98,26,73,51,46,2,106,50,65,21
50005,16,39,36,71,46,27,106,33,21,93
50006,104,44,21,62,73,86,19,74,76,67
50007,68,23,2,76,39,65,73,21,9,30
50008,112,11,39,104,101,55,19,16,79,63
50009,8,5,42,3,92,9,31,12,74,21


In [4]:
#split the data for team 1 (player slot 0-4) and team 2 (player slot 128-132)
split1,split2 = np.split(competitor_rows,2,axis=1)

#set the columns to be the same so that we can collapse it back into 1 table
split2.columns = split1.columns

index0 = split1.index.tolist()
index1 = split2.index.tolist()

In [5]:
#generate new indexes, so that each pair of even number represents team 1 and odd number represents team 2
i = 0
while i <= 99999:
    index0[i] = i*2
    index1[i] = i*2 + 1
    i += 1
    
print(len(index0))

100000


In [6]:
split1.index = index0
split2.index = index1

In [7]:
#join the team 1 and team 2 tables back together with their new indexes and sort
join = split1.append(split2)

In [8]:
join = join.sort_index()
join.head(n=20)

player_slot,0,1,2,3,4
0,96,84,46,85,39
1,9,75,106,74,62
2,44,10,57,2,106
3,58,61,21,18,14
4,74,7,42,99,88
5,69,8,25,26,79
6,44,15,110,56,94
7,2,101,32,7,72
8,98,26,73,51,46
9,2,106,50,65,21


In [9]:
outcomes.describe()

Unnamed: 0,match_id,radiant_win
count,100000.0,100000.0
mean,99999.5,0.51861
std,28867.657797,0.499656
min,50000.0,0.0
25%,74999.75,0.0
50%,99999.5,1.0
75%,124999.25,1.0
max,149999.0,1.0


In [10]:
#determine which teams lost, and flag them for deletion
delete_list=[]
i=0
while i <= 99999:
    if outcomes.iloc[i,1] == 1:
        delete_list.append(1)
        delete_list.append(0)
        i += 1
    else:
        delete_list.append(0)
        delete_list.append(1)
        i += 1

In [11]:
#append deletion flags to dataframe
delete_array = np.asarray(delete_list)
print(type(delete_array))
join["delete"] = delete_array

<class 'numpy.ndarray'>


In [12]:
join.describe()

player_slot,0,1,2,3,4,delete
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,50.703985,50.948235,51.04427,51.03686,51.06732,0.5
std,33.218564,33.221513,33.204206,33.285523,33.216627,0.500001
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,21.0,21.0,21.0,21.0,21.0,0.0
50%,47.0,48.0,48.0,48.0,49.0,0.5
75%,75.0,75.0,75.0,75.0,75.0,1.0
max,112.0,112.0,112.0,112.0,112.0,1.0


In [13]:
#delete rows belonging to losing teams
winners = join[join.delete != 1]
winners = winners.drop(["delete"],axis=1)
winners.describe()

player_slot,0,1,2,3,4
count,100000.0,100000.0,100000.0,100000.0,100000.0
mean,50.69296,51.00354,51.25891,51.03835,51.04539
std,33.420096,33.36109,33.415914,33.424149,33.382218
min,0.0,0.0,1.0,0.0,0.0
25%,21.0,21.0,21.0,21.0,21.0
50%,47.0,48.0,49.0,48.0,48.0
75%,75.0,75.0,76.0,75.0,75.0
max,112.0,112.0,112.0,112.0,112.0


In [14]:
champions.head()

Unnamed: 0,name,hero_id,localized_name,role
0,npc_dota_hero_antimage,1,Anti-Mage,carry
1,npc_dota_hero_axe,2,Axe,initiator
2,npc_dota_hero_bane,3,Bane,disabler
3,npc_dota_hero_bloodseeker,4,Bloodseeker,carry
4,npc_dota_hero_crystal_maiden,5,Crystal Maiden,support


In [15]:
#create function to determine role of each champion on a team
def find_role(int):
    if (int == 0):
        int = 113
    champ_row = champions.loc[champions["hero_id"] == int]
    role =  champ_row["role"]
    role = role.values[0]
    return role

In [16]:
#create list of champion roles present in each team
dataset = []
for row in winners.iterrows():
    temp_list = []
    for role in range(0,5):
        temp_list.append(find_role(row[1][role]))
    dataset.append(temp_list)

In [17]:
#transform list of champion roles present in each game for apriori
oht = TransactionEncoder()
oht_ary = oht.fit(dataset).transform(dataset)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
df.head()

Unnamed: 0,carry,disabler,durable,escape,initiator,jungler,nuker,pusher,support
0,True,False,True,False,False,False,True,False,True
1,True,True,False,False,False,True,False,True,False
2,False,True,True,False,True,False,True,False,False
3,True,False,False,False,True,False,True,False,False
4,True,False,False,False,True,False,True,False,False


In [18]:
#run apriori association rule mining
frequent_champsets = apriori(df, min_support=0.25, use_colnames=True)
pd.DataFrame(frequent_champsets)

Unnamed: 0,support,itemsets
0,0.86973,(carry)
1,0.32809,(disabler)
2,0.36446,(escape)
3,0.55081,(initiator)
4,0.66339,(nuker)
5,0.69226,(support)
6,0.27262,"(disabler, carry)"
7,0.27894,"(escape, carry)"
8,0.4627,"(initiator, carry)"
9,0.55477,"(nuker, carry)"


In [19]:
rules = association_rules(frequent_champsets, metric="confidence", min_threshold=0.65)
rule_frame = pd.DataFrame(rules[['antecedents', 'consequents', 'support', 'confidence']])
rule_frame = rule_frame.sort_values(['confidence'], ascending=[0])
rule_frame

Unnamed: 0,antecedents,consequents,support,confidence
4,(support),(carry),0.59769,0.863389
2,(initiator),(carry),0.4627,0.840036
3,(nuker),(carry),0.55477,0.836265
0,(disabler),(carry),0.27262,0.830931
7,"(initiator, support)",(carry),0.29194,0.827541
8,"(nuker, support)",(carry),0.34896,0.820754
6,"(initiator, nuker)",(carry),0.27247,0.792686
1,(escape),(carry),0.27894,0.765351
5,(carry),(support),0.59769,0.687213
