In [7]:
import numpy as np
import pandas as pd
from apyori import apriori

In [8]:
# Load the player-embedding table (expected: 6358 rows) and preview the first two rows
EMBEDDINGS_CSV = '../../datasets/soccer_player_embeddings_v1.csv'
df = pd.read_csv(EMBEDDINGS_CSV)
df.head(n=2)

Unnamed: 0,id,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC12,PC13,PC14,player_fifa_api_id,player_api_id,overall_rating,potential,player_name,birthday,player_positions
0,97455,-0.020058,0.683692,-0.25527,-0.329416,-0.594869,0.124635,-0.209664,0.304167,-0.082228,...,0.047019,-0.091479,0.273118,119152,30831,78.0,78.0,Kolo Toure,1981-03-19 00:00:00,CB
1,42881,0.068503,0.188564,-0.228424,-0.542123,-0.304828,0.015172,0.001972,-0.151031,-0.113127,...,-0.188535,0.25828,-0.195703,199284,133126,63.0,63.0,Dennis Hediger,1986-09-22 00:00:00,"CDM, CM, CAM"


In [9]:
# Convert each row's player_positions from a string to a list
def convert_to_list(position_str):
    """Split a comma-separated positions string into a sorted list of codes.

    Parameters
    ----------
    position_str : str
        Positions such as ``"CDM, CM, CAM"``. Whitespace around each code
        is ignored, so ``"CB,CM"`` parses the same as ``"CB, CM"``.

    Returns
    -------
    list of str
        Position codes in ascending order. Empty tokens are dropped, so an
        empty string or a non-string value (e.g. NaN / None for a missing
        cell) yields ``[]`` instead of ``['']`` or an AttributeError.
    """
    # Non-string values (NaN, None) have no .split(); treat them as "no positions"
    if not isinstance(position_str, str):
        return []
    # Split on the bare comma and strip each token so spacing variations are tolerated
    tokens = (token.strip() for token in position_str.split(","))
    return sorted(token for token in tokens if token)

# Add a list-valued column by parsing every row's player_positions string
df['player_positions_list'] = df['player_positions'].apply(convert_to_list)

In [10]:
# Build the transaction list (a list of position lists) for association rule mining.
# Series.tolist() walks the column positionally, so it works for any index labels,
# unlike label lookups with range(len(df)), which need a default 0..n-1 index.
records = df['player_positions_list'].tolist()

In [11]:
# min_support ~= 27.6 (mean of the position value counts) / 6358 (rows) ~= 0.0043
# apyori's apriori() only reads min_support, min_confidence, min_lift and max_length;
# it has no min_length option (unknown keywords are silently ignored), so the
# "at least two positions per itemset" requirement is enforced explicitly below.
association_rules = apriori(records, min_support=0.0043, min_confidence=0.5)
association_results = [rule for rule in association_rules if len(rule.items) >= 2]

In [12]:
# Print the items of each itemset returned by apriori (element 0 of every result)
for item in association_results:
    print("Frequent itemset: {}".format(list(item[0])))
    

Frequent itemset: ['CM', 'CDM']
Frequent itemset: ['CF', 'ST']
Frequent itemset: ['LWB', 'LB']
Frequent itemset: ['CM', 'CAM', 'CDM']
Frequent itemset: ['RM', 'CAM', 'LM']
Frequent itemset: ['CB', 'CDM', 'CM']
Frequent itemset: ['CM', 'CDM', 'RM']


In [18]:
# Flatten the apriori output into parallel lists holding one entry per rule.
# Every result unpacks to (itemset, support, rule statistics); every rule
# statistic unpacks to (antecedent, consequent, confidence, lift).
frequent_itemsets, support_ls = [], []
antecedent_ls, consequent_ls = [], []
confidence_ls, lift_ls = [], []

for itemset, itemset_support, rule_stats in association_results:
    for antecedent, consequent, confidence, lift in rule_stats:
        # Itemset-level values are repeated for each rule derived from that itemset
        frequent_itemsets.append(list(itemset))
        support_ls.append(itemset_support)
        antecedent_ls.append(list(antecedent))
        consequent_ls.append(list(consequent))
        confidence_ls.append(confidence)
        lift_ls.append(lift)

In [21]:
# Collect the per-rule lists into one table; keyword order sets the column order
d = dict(
    freq_itemset=frequent_itemsets,
    antecedent=antecedent_ls,
    consequent=consequent_ls,
    support=support_ls,
    confidence=confidence_ls,
    lift=lift_ls,
)

arm_df = pd.DataFrame(d)
arm_df

Unnamed: 0,freq_itemset,antecedent,consequent,support,confidence,lift
0,"[CM, CDM]",[CDM],[CM],0.119534,0.692168,3.318855
1,"[CM, CDM]",[CM],[CDM],0.119534,0.573152,3.318855
2,"[CF, ST]",[CF],[ST],0.019031,0.528384,2.903602
3,"[LWB, LB]",[LWB],[LB],0.004718,0.681818,5.749337
4,"[CM, CAM, CDM]","[CAM, CDM]",[CM],0.017301,0.846154,4.057199
5,"[RM, CAM, LM]","[CAM, LM]",[RM],0.021076,0.529644,3.668277
6,"[RM, CAM, LM]","[CAM, RM]",[LM],0.021076,0.55144,3.869821
7,"[CB, CDM, CM]","[CB, CM]",[CDM],0.009122,0.805556,4.664592
8,"[CM, CDM, RM]","[CDM, RM]",[CM],0.005348,0.772727,3.705128
