In [1]:
# Show the kernel's current working directory. The relative file name passed
# to pd.read_csv() further down is resolved against this directory.
import os
os.getcwd()

# To work from a different directory, uncomment the line below and replace
# the placeholder with the desired path.
#os.chdir('working directory path')

'C:\\Users\\jayam\\DW_2024'

In [2]:
#Import necessary libraries, pandas, numpy and mlxtend
import pandas as pd
import numpy as np
import mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [3]:
# Load the Olympic medals table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='olympic_medals.csv')

# Plain-text dump of the whole frame (pandas abbreviates the middle rows
# and wraps wide column groups onto separate lines).
print(df)

      discipline_title     slug_game                      event_title  \
0              Curling  beijing-2022                    Mixed Doubles   
1              Curling  beijing-2022                    Mixed Doubles   
2              Curling  beijing-2022                    Mixed Doubles   
3              Curling  beijing-2022                    Mixed Doubles   
4              Curling  beijing-2022                    Mixed Doubles   
...                ...           ...                              ...   
21692    Weightlifting   athens-1896  heavyweight - one hand lift men   
21693    Weightlifting   athens-1896  heavyweight - one hand lift men   
21694    Weightlifting   athens-1896  heavyweight - two hand lift men   
21695    Weightlifting   athens-1896  heavyweight - two hand lift men   
21696    Weightlifting   athens-1896  heavyweight - two hand lift men   

      event_gender medal_type participant_type participant_title  \
0            Mixed       GOLD         GameTeam         

In [4]:
# Structural overview of the dataframe, printed one after another:
#   1. the first five rows
#   2. the dtype of every column
#   3. the column labels
#   4. the (rows, columns) shape
print(df.head(), df.dtypes, df.columns, df.shape, sep='\n')

  discipline_title     slug_game    event_title event_gender medal_type  \
0          Curling  beijing-2022  Mixed Doubles        Mixed       GOLD   
1          Curling  beijing-2022  Mixed Doubles        Mixed       GOLD   
2          Curling  beijing-2022  Mixed Doubles        Mixed     SILVER   
3          Curling  beijing-2022  Mixed Doubles        Mixed     SILVER   
4          Curling  beijing-2022  Mixed Doubles        Mixed     BRONZE   

  participant_type participant_title  \
0         GameTeam             Italy   
1         GameTeam             Italy   
2         GameTeam            Norway   
3         GameTeam            Norway   
4         GameTeam            Sweden   

                                         athlete_url     athlete_full_name  \
0  https://olympics.com/en/athletes/stefania-cons...  Stefania CONSTANTINI   
1      https://olympics.com/en/athletes/amos-mosaner          Amos MOSANER   
2  https://olympics.com/en/athletes/kristin-skaslien      Kristin SKASLIEN

In [5]:
# Missing-value audit: first the element-wise boolean mask (True marks a
# missing cell), then the count of missing cells in each column.
print(df.isna(), df.isna().sum(), sep='\n')

       discipline_title  slug_game  event_title  event_gender  medal_type  \
0                 False      False        False         False       False   
1                 False      False        False         False       False   
2                 False      False        False         False       False   
3                 False      False        False         False       False   
4                 False      False        False         False       False   
...                 ...        ...          ...           ...         ...   
21692             False      False        False         False       False   
21693             False      False        False         False       False   
21694             False      False        False         False       False   
21695             False      False        False         False       False   
21696             False      False        False         False       False   

       participant_type  participant_title  athlete_url  athlete_full_name 

In [6]:
# Build the working frame without the columns that are not useful for the
# rule mining and that contain NAs; df itself is left unchanged.
new_df = df.drop(
    columns=[
        'participant_title',
        'athlete_url',
        'athlete_full_name',
        'country_code',
        'country_3_letter_code',
    ]
)


In [7]:
# Data transformation: treat every row of new_df as one transaction and
# one-hot encode the transactions so they can be fed to apriori().

# TransactionEncoder only handles string items, so cast every cell to str.
# NOTE(review): astype(str) turns any remaining missing value into the literal
# string 'nan', which would then be encoded as an item of its own -- confirm
# that no NAs are left in new_df at this point.
new_df = new_df.astype(str)

# TransactionEncoder expects a list of lists (one inner list per transaction).
# The name `transactions` is used rather than `list` so that the built-in
# list type is not shadowed for the rest of the kernel session.
transactions = new_df.values.tolist()

# Convert the transactions to a one-hot encoded boolean NumPy array with one
# column per distinct item. apriori() only accepts boolean-like data
# (True/False or 1/0).
te = TransactionEncoder()
array_te = te.fit(transactions).transform(transactions)

# apriori() works on a DataFrame, so wrap the array and label its columns
# with the item names learned by the encoder (te.columns_).
# To inspect array_te or te.columns_, evaluate them as the last expression of
# a cell; a bare expression in the middle of a cell displays nothing.
arm_df = pd.DataFrame(array_te, columns=te.columns_)

In [8]:
# Mine the frequent itemsets with a minimum support of 0.2, reporting items
# by their column names instead of column indices.
frequent_itemsets = apriori(arm_df, use_colnames=True, min_support=0.2)

# Record how many items each itemset contains.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# Display the itemsets made of exactly two items whose support is >= 0.3.
frequent_itemsets[
    frequent_itemsets['length'].eq(2) & frequent_itemsets['support'].ge(0.3)
]

Unnamed: 0,support,itemsets,length
9,0.471033,"(Men, Athlete)",2


In [12]:
# Derive association rules from the frequent itemsets, using confidence as
# the evaluation metric with a minimum threshold of 0.5.
rules_con = association_rules(
    frequent_itemsets,
    min_threshold=0.5,
    metric="confidence",
)
rules_con

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(BRONZE),(Athlete),0.347006,0.696548,0.246071,0.709125,1.018056,0.004364,1.043238,0.027161
1,(GOLD),(Athlete),0.327649,0.696548,0.225699,0.688845,0.988941,-0.002524,0.975245,-0.016359
2,(Men),(Athlete),0.642116,0.696548,0.471033,0.733563,1.053141,0.023768,1.138926,0.140994
3,(Athlete),(Men),0.696548,0.642116,0.471033,0.676239,1.053141,0.023768,1.105394,0.166284
4,(SILVER),(Athlete),0.325345,0.696548,0.224778,0.690891,0.991879,-0.00184,0.981699,-0.011991
5,(Women),(Athlete),0.291423,0.696548,0.210859,0.723549,1.038764,0.007869,1.09767,0.052665
6,(BRONZE),(Men),0.347006,0.642116,0.224639,0.647364,1.008172,0.001821,1.01488,0.012413
7,(GOLD),(Men),0.327649,0.642116,0.209246,0.638627,0.994566,-0.001143,0.990344,-0.008061
8,(SILVER),(Men),0.325345,0.642116,0.208232,0.640034,0.996757,-0.000678,0.994215,-0.004799


In [14]:
# Derive association rules from the frequent itemsets, using lift as the
# evaluation metric with a minimum threshold of 1.
rules_lift = association_rules(
    frequent_itemsets,
    min_threshold=1,
    metric="lift",
)
rules_lift

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(BRONZE),(Athlete),0.347006,0.696548,0.246071,0.709125,1.018056,0.004364,1.043238,0.027161
1,(Athlete),(BRONZE),0.696548,0.347006,0.246071,0.353272,1.018056,0.004364,1.009688,0.058446
2,(Men),(Athlete),0.642116,0.696548,0.471033,0.733563,1.053141,0.023768,1.138926,0.140994
3,(Athlete),(Men),0.696548,0.642116,0.471033,0.676239,1.053141,0.023768,1.105394,0.166284
4,(Women),(Athlete),0.291423,0.696548,0.210859,0.723549,1.038764,0.007869,1.09767,0.052665
5,(Athlete),(Women),0.696548,0.291423,0.210859,0.30272,1.038764,0.007869,1.016201,0.122977
6,(Men),(BRONZE),0.642116,0.347006,0.224639,0.349842,1.008172,0.001821,1.004361,0.022648
7,(BRONZE),(Men),0.347006,0.642116,0.224639,0.647364,1.008172,0.001821,1.01488,0.012413


In [15]:
# From the confidence-based rules (min confidence = 0.5), keep only the
# columns of interest: antecedents, consequents, support, confidence and lift.
result_arm = rules_con.loc[
    :, ['antecedents', 'consequents', 'support', 'confidence', 'lift']
]
result_arm

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(BRONZE),(Athlete),0.246071,0.709125,1.018056
1,(GOLD),(Athlete),0.225699,0.688845,0.988941
2,(Men),(Athlete),0.471033,0.733563,1.053141
3,(Athlete),(Men),0.471033,0.676239,1.053141
4,(SILVER),(Athlete),0.224778,0.690891,0.991879
5,(Women),(Athlete),0.210859,0.723549,1.038764
6,(BRONZE),(Men),0.224639,0.647364,1.008172
7,(GOLD),(Men),0.209246,0.638627,0.994566
8,(SILVER),(Men),0.208232,0.640034,0.996757


In [16]:
# Narrow the result down to the rules whose confidence is at least 0.7.
new_result_arm = result_arm.loc[result_arm['confidence'].ge(0.7)]
new_result_arm

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(BRONZE),(Athlete),0.246071,0.709125,1.018056
2,(Men),(Athlete),0.471033,0.733563,1.053141
5,(Women),(Athlete),0.210859,0.723549,1.038764
