In [1]:
import numpy as np
import pandas as pd
from apyori import apriori
import os

In [33]:
# Read the player-embedding features into a dataframe (expected number of rows: 6358)
file = 'soccer_player_embeddings_feature_no_labels.csv'
df = pd.read_csv('../../datasets/final_embeddings/{}'.format(file))

# Create folder to store ARM results.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it cannot fail if the
# folder appears between the check and the creation, and it also creates any
# missing parent directories should SAVE_PATH ever become a nested path.
SAVE_PATH = './ARM_results'
os.makedirs(SAVE_PATH, exist_ok=True)

In [34]:
# Play around with data
# Convert each row's player_positions from a string to a list
def convert_to_list(position_str):
    """Split a ", "-separated position string into an alphabetically sorted list."""
    positions = position_str.split(", ")
    positions.sort()
    return positions

# Store the parsed (sorted) position list of every row in its own column
df['player_positions_list'] = df['player_positions'].apply(convert_to_list)

In [35]:
# Summary statistics for the raw position label strings
position_counts = df['player_positions'].value_counts()
df_mean = position_counts.mean()         # mean number of rows per distinct label string
n_samples = len(df['player_positions'])  # total number of rows
for line in ('Info on Dataset',
             'Mean: {:.1f} \t Number of Samples: {}'.format(df_mean, n_samples),
             'Average Frequency: {:.4f}'.format(df_mean/n_samples)):
    print(line)

Info on Dataset
Mean: 11.8 	 Number of Samples: 6358
Average Frequency: 0.0019


In [36]:
# Thresholds read by assoc_mining when it calls apriori
MIN_SUPPORT = 0.0043    # min_support argument
MIN_CONFIDENCE = 0.5    # min_confidence argument
MIN_LENGTH = 2          # min_length argument

# Raw ", "-separated position label of every row; this is the input to assoc_mining
old_pos = df.loc[:, 'player_positions']

In [37]:
# Function that takes the original player-position labels and returns the combined positions found via ARM
def assoc_mining(old_pos, replace_all=True, save_link=SAVE_PATH, max_itemsets=3):
    """Merge frequently co-occurring player positions via association rule mining (ARM).

    Parameters
    ----------
    old_pos : pandas.Series of str
        One label per row, positions separated by ", " (e.g. "CM, CDM").
    replace_all : bool, default True
        If True, a row is rewritten as soon as it shares at least one position
        with a frequent itemset. If False, a row is rewritten only when it
        contains every position of the itemset.
    save_link : str or None
        Directory that receives ``ARM_results.csv``; a falsy value skips saving.
    max_itemsets : int or None, default 3
        How many of the leading frequent itemsets are used for merging.
        ``None`` uses all of them.

    Returns
    -------
    new_pos : list of list of str
        Merged position lists, one per input row.
    new_pos_df : pandas.Series
        The same lists as a Series sharing the index and name of ``old_pos``.
    arm_df : pandas.DataFrame
        One row per mined rule with columns freq_itemset, antecedent,
        consequent, support, confidence and lift.
    """
    # Convert each label to a sorted list of positions (one transaction per row)
    old_pos_df = old_pos.apply(lambda x: sorted(x.split(", ")))
    old_pos_ls = list(old_pos_df)

    # Do ARM
    # NOTE(review): the original rationale for MIN_SUPPORT was
    # "27.6 (mean of value counts) / 6358 --> ~0.0043", but the dataset summary
    # cell prints a mean of 11.8 for the raw label strings -- confirm which
    # statistic the threshold is meant to be derived from.
    association_rules = list(apriori(old_pos_ls, min_support=MIN_SUPPORT,
                                     min_confidence=MIN_CONFIDENCE, min_length=MIN_LENGTH))
    # NOTE(review): apyori's apriori appears to read only min_support, min_confidence,
    # min_lift and max_length, silently ignoring other keywords such as min_length --
    # confirm against the installed version. The minimum itemset size is therefore
    # enforced explicitly here.
    association_rules = [item for item in association_rules if len(item[0]) >= MIN_LENGTH]

    columns = ['freq_itemset', 'antecedent', 'consequent', 'support', 'confidence', 'lift']
    freq_itemsets = []
    records = []
    # Collect the ARM results as plain records and build the DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic inside a loop.
    for item in association_rules:
        freq_itemsets.append(list(item[0]))
        for stats in item[2]:
            records.append({'freq_itemset': list(item[0]),
                            'antecedent': list(stats[0]),
                            'consequent': list(stats[1]),
                            'support': item[1],
                            'confidence': stats[2],
                            'lift': stats[3]})
    arm_df = pd.DataFrame(records, columns=columns)
    if save_link:
        arm_df.to_csv('{}/ARM_results.csv'.format(save_link), header=True, index = False)

    selected = freq_itemsets if max_itemsets is None else freq_itemsets[:max_itemsets]

    # Work on a plain list so that rows are addressed by position; writing
    # ``series[i]`` would address by index *label* and break for any input
    # whose index is not 0..n-1.
    new_rows = list(old_pos_ls)
    for target in selected:
        target_set = set(target)
        rep_str = '_'.join(sorted(target))
        # Every itemset is matched against the ORIGINAL row, so when a row
        # matches several itemsets the last matching one determines the result.
        for i, row in enumerate(old_pos_ls):
            if replace_all:
                # Replace as long as one of the items in the frequent itemset exists
                matched = any(pos in target_set for pos in row)
            else:
                # Only replace if all items in the frequent itemset exist in the row
                # (the previous check tested the reverse containment)
                matched = target_set.issubset(row)
            if matched:
                merged = [pos for pos in row if pos not in target_set]
                merged.append(rep_str)
                new_rows[i] = sorted(merged)

    new_pos_df = pd.Series(new_rows, index=old_pos_df.index,
                           name=old_pos_df.name, dtype=object)

    # Convert new position Series to a list
    new_pos = list(new_pos_df)

    return new_pos, new_pos_df, arm_df

In [38]:
# Feed the raw player_positions column to assoc_mining and unpack its three results:
# merged labels as a list, merged labels as a Series, and the table of mined rules
new_pos, new_pos_df, arm_df = assoc_mining(old_pos, replace_all=True)

In [39]:
# Check the results from ARM: one row per mined rule
# (freq_itemset, antecedent, consequent, support, confidence, lift)
arm_df

Unnamed: 0,freq_itemset,antecedent,consequent,support,confidence,lift
0,"[CM, CDM]",[CDM],[CM],0.119534,0.692168,3.318855
1,"[CM, CDM]",[CM],[CDM],0.119534,0.573152,3.318855
2,"[CF, ST]",[CF],[ST],0.019031,0.528384,2.903602
3,"[LWB, LB]",[LWB],[LB],0.004718,0.681818,5.749337
4,"[CM, CDM, CAM]","[CDM, CAM]",[CM],0.017301,0.846154,4.057199
5,"[RM, LM, CAM]","[LM, CAM]",[RM],0.021076,0.529644,3.668277
6,"[RM, LM, CAM]","[RM, CAM]",[LM],0.021076,0.55144,3.869821
7,"[CB, CM, CDM]","[CB, CM]",[CDM],0.009122,0.805556,4.664592
8,"[CM, CDM, RM]","[CDM, RM]",[CM],0.005348,0.772727,3.705128


In [40]:
# Check the new positions after ARM (first rows of the merged position lists)
new_pos_df.head()

0                 [CB]
1             [LB_LWB]
2             [CB, RB]
3                 [CB]
4    [CAM, CDM_CM, LM]
Name: player_positions, dtype: object