In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [3]:
goal = pd.read_csv("./ronaldo_goal_prediction.csv")

In [4]:
goal

Unnamed: 0,Opponent Strength,Match Location,Ronaldo’s Form,Team Support,Scored
0,Strong,Away,Good,High,No
1,Weak,Home,Excellent,High,Yes
2,Medium,Away,Average,Low,No
3,Weak,Home,Good,Medium,Yes
4,Strong,Home,Excellent,High,Yes
5,Medium,Away,Poor,Low,No
6,Weak,Away,Good,High,Yes
7,Medium,Home,Average,Medium,No
8,Strong,Home,Excellent,Medium,Yes
9,Weak,Away,Poor,High,No


In [5]:
goal.describe()

Unnamed: 0,Opponent Strength,Match Location,Ronaldo’s Form,Team Support,Scored
count,14,14,14,14,14
unique,3,2,4,3,2
top,Weak,Home,Good,High,Yes
freq,5,8,4,7,8


In [6]:
class TreeNode:
  def __init__(self, value):
    self.value = value
    self.left = None
    self.right = None

In [24]:
def Gini(source, x, y):
  data = source[[x, y]]
  #print(data)
  
  unique_feature_vals = data[x].unique()
  unique_target_vals = data[y].unique()
  
  stats = pd.DataFrame(index=unique_feature_vals, columns=[*unique_target_vals, "total"])
  
  length = len(data)
  
  for value in unique_feature_vals:
    total = 0
    for target_value in unique_target_vals:
      count = len(data[(data[x] == value) & (data[y] == target_value)])
      stats.loc[value, target_value] = count
      total += count
    stats.loc[value, "total"] = total
  
  #print(stats)
  
  gini_by_feat = pd.DataFrame(
    index=unique_feature_vals, 
    data={"gini": np.zeros(len(unique_feature_vals))})
  
  for value in unique_feature_vals:
    p = stats.loc[value, unique_target_vals] / stats.loc[value, "total"]
    gini_by_feat.loc[value, "gini"] = 1 - sum(p**2)
  
  p_cat = stats["total"] / length
  
  return (gini_by_feat, sum(np.array(gini_by_feat["gini"]) * p_cat))  

In [23]:
X = goal.drop(columns=["Scored"])
y = goal["Scored"]

gini_vals = []

for column in X.columns:
    g = Gini(goal, column, "Scored")[1]
    gini_vals.append(g)

gini = pd.DataFrame({"gini": gini_vals}, index=X.columns)

min_idx = gini.loc[gini.idxmin()].index[0]
print(min_idx)

Ronaldo’s Form


In [68]:
gini_by_feat, total_gini = Gini(goal, min_idx, "Scored")

print(gini_by_feat)
print(total_gini)


            gini
Good       0.375
Excellent  0.000
Average    0.375
Poor       0.000
0.21428571428571427


In [91]:
import itertools
import numpy as np

def get_subset(data, x):
    unique = data[x].unique()
    subset = []
    seen = set()

    for i in range(1, len(unique)):
        for combo in itertools.combinations(unique, i):
            left = tuple(sorted(combo))
            right = tuple(sorted(np.setdiff1d(unique, combo)))
            
            pair = tuple(sorted([left, right]))
            
            if pair not in seen:
                seen.add(pair)
                subset.append((np.array(left), np.array(right)))

    return subset

In [96]:
subset = get_subset(goal, "Ronaldo’s Form")

subset

[(array(['Good'], dtype='<U4'),
  array(['Average', 'Excellent', 'Poor'], dtype='<U9')),
 (array(['Excellent'], dtype='<U9'),
  array(['Average', 'Good', 'Poor'], dtype='<U7')),
 (array(['Average'], dtype='<U7'),
  array(['Excellent', 'Good', 'Poor'], dtype='<U9')),
 (array(['Poor'], dtype='<U4'),
  array(['Average', 'Excellent', 'Good'], dtype='<U9')),
 (array(['Excellent', 'Good'], dtype='<U9'),
  array(['Average', 'Poor'], dtype='<U7')),
 (array(['Average', 'Good'], dtype='<U7'),
  array(['Excellent', 'Poor'], dtype='<U9')),
 (array(['Good', 'Poor'], dtype='<U4'),
  array(['Average', 'Excellent'], dtype='<U9'))]

In [97]:
len(subset)

7

In [133]:
def subset_Gini(source, x, y, left_split, right_split):
  data = source[[x, y]]
  
  unique_feature_vals = data[x].unique()
  unique_target_vals = data[y].unique()
  
  stats = pd.DataFrame(index=unique_feature_vals, columns=[*unique_target_vals, "total"])
  
  length = len(data)
  
  for value in unique_feature_vals:
    total = 0
    for target_value in unique_target_vals:
      count = len(data[(data[x] == value) & (data[y] == target_value)])
      stats.loc[value, target_value] = count
      total += count
    stats.loc[value, "total"] = total
  
  left_split_stats = stats.loc[left_split]
  right_split_stats = stats.loc[right_split]
  
  right_split_total = right_split_stats["total"].sum()
  left_split_total = left_split_stats["total"].sum()
  
  p_right = []
  for value in right_split:
    count = 0
    for target_value in unique_target_vals:
      count += stats.loc[value, target_value]
    p_right.append(count / right_split_total)
  p_right = np.array(p_right)
  
    
  p_left = []
  for value in left_split:
    count = 0
    for target_value in unique_target_vals:
      count += stats.loc[value, target_value]
    p_left.append(count / left_split_total)
  p_left = np.array(p_left)
  
  left_gini = 1 - sum(p_left**2)
  right_gini = 1 - sum(p_right**2)
  
  total_gini = right_split_total / length * right_gini + left_split_total / length * left_gini
  
  return total_gini
  

In [None]:

subset

subset_Gini(goal, "Ronaldo’s Form", "Scored", ['Good', 'Poor'], ['Average', 'Excellent'])

     No Yes total
Good  1   3     4
Poor  2   0     2
          No Yes total
Average    3   1     4
Excellent  0   4     4
6
8
0.4444444444444444
0.5
0.47619047619047616


In [142]:
gini_subset_stats = pd.DataFrame(columns={"name": [], "gini": []})

for item in subset:
  left_split = item[1]
  right_split = item[0]

  name = f"{left_split} vs {right_split}"
  gini = subset_Gini(goal, min_idx, "Scored", left_split, right_split)
  
  gini_subset_stats.loc[len(gini_subset_stats)] = [name, gini]
  
gini_subset_stats

Unnamed: 0,name,gini
0,['Average' 'Excellent' 'Poor'] vs ['Good'],0.457143
1,['Average' 'Good' 'Poor'] vs ['Excellent'],0.457143
2,['Excellent' 'Good' 'Poor'] vs ['Average'],0.457143
3,['Average' 'Excellent' 'Good'] vs ['Poor'],0.571429
4,['Average' 'Poor'] vs ['Excellent' 'Good'],0.47619
5,['Excellent' 'Poor'] vs ['Average' 'Good'],0.47619
6,['Average' 'Excellent'] vs ['Good' 'Poor'],0.47619


In [143]:
best_split = gini_subset_stats.loc[gini_subset_stats["gini"].idxmin()]
print(best_split)

name    ['Average' 'Excellent' 'Poor'] vs ['Good']
gini                                      0.457143
Name: 0, dtype: object
