In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re

from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer

# Sumo Matches Forecast

https://data.world/cervus/sumo-japan

https://josh-sulkers.medium.com/building-a-sumo-wrestling-match-predictor-using-machine-learning-2ac95c5e20a3

first wrestler result after the bout (final result in brackets)





Wrestler performance can be viewed from multiple prospectives:
- the overall career wins 
- the wins in the current tournament; each tournament is 15 days, 1 bout / day
- bouts vs current opponent
This way we can capture the overall, current, and vs opponent performance of the rikishi.

We can use several metrics: 
- absolute number of wins
- proportion of wins
- weighted moving average (WMA) (e.g. last 4 bouts)
    - exponentially weighed
    - other (custom) weights

Absolute number. WMA recent observations are given relatively more weight than older ones.


**Features**

Physical traits
- age
- height
- weight
- body mass index - calculated

Wrestler performance
- wins vs current opponent, exponentially WMA of wins for the last 10 bouts
- wins vs 


career_wins1
tournament_wins1
enemy_wins


We will treat the problem as a 2-class-value classification problem:
- class 0 - wrestler1 loses (the opponent wins)
- class 1 - wrestler1 wins



## Prepare data

### Preprocess tournament data

In [327]:
# Read tournament records

results = pd.read_csv("data/results.csv")
results.columns = [
    'basho', 'day',
    'id1', 'rank1', 'name1', 'tournament_score1', 'bout_outcome1',
    'kimarite',
    'id2', 'rank2', 'name2', 'tournament_score2', 'bout_outcome2',
]
results.tail()

Unnamed: 0,basho,day,id1,rank1,name1,tournament_score1,bout_outcome1,kimarite,id2,rank2,name2,tournament_score2,bout_outcome2
224775,2022.09,15,12370,S1e,Wakatakakage,11-4,1,yorikiri,2879,M5w,Sadanoumi,9-6,0
224776,2022.09,15,11980,M6e,Wakamotoharu,10-5,1,yorikiri,12210,O2w,Mitakeumi,4-11,0
224777,2022.09,15,12210,O2w,Mitakeumi,4-11,0,yorikiri,11980,M6e,Wakamotoharu,10-5,1
224778,2022.09,15,12130,O1w,Shodai,4-11,0,oshidashi,12191,O1e,Takakeisho,10-5,1
224779,2022.09,15,12191,O1e,Takakeisho,10-5,1,oshidashi,12130,O1w,Shodai,4-11,0


In [328]:
# Filter out Makuuchi division

makuuchi_regex = r"([YOSKM]+[1-9]+)"
results.rank1 = results.rank1.str.extract(makuuchi_regex, expand=True)
results.rank2 = results.rank2.str.extract(makuuchi_regex, expand=True)

results = results.dropna().reset_index(drop=True)

In [329]:
# # Drop duplicate matches

# results.loc[results.rikishi1_id > results.rikishi2_id, "match_id"] = \
# results.basho.astype(str) + "," + results.day.astype(str) + "," + results.rikishi1_id.astype(str) + "," + results.rikishi2_id.astype(str)

# results.loc[results.rikishi1_id < results.rikishi2_id, "match_id"] = \
# results.basho.astype(str) + "," + results.day.astype(str) + "," + results.rikishi2_id.astype(str) + "," + results.rikishi1_id.astype(str)

# results = results.drop_duplicates(subset="match_id")
# results = results.drop(columns="match_id")
# results = results.reset_index(drop=True)

In [330]:
# Extract year and month columns

def get_year_and_month_from_date(date_column):
    year_and_month = date_column.astype(str).str.split(".", expand=True)
    year_and_month.columns = ["year", "month"]
    year_and_month = year_and_month.astype(int)
    
    return year_and_month

year_and_month = get_year_and_month_from_date(results.basho)
results = pd.concat([results, year_and_month], axis=1)
results = results.drop(columns="basho")

In [331]:
# Create columns for tournament wins and losses (before the current bout)

score_regex = r"([0-9]+-[0-9]+)"

# Wrestler1
wins_and_losses = results.tournament_score1.str.extract(score_regex)[0]
wins_and_losses = wins_and_losses.str.split("-", expand=True)
wins_and_losses.columns = ["tournament_wins1", "tournament_losses1"]
wins_and_losses = wins_and_losses.astype(int)
results = pd.concat([results, wins_and_losses], axis=1)

# Subtract the current bout outcome to get wins/losses correct count before the fight
results.tournament_wins1 -= results.bout_outcome1
results.tournament_losses1 -= results.bout_outcome2


# Repeat for wrestler2
wins_and_losses = results.tournament_score2.str.extract(score_regex)[0]
wins_and_losses = wins_and_losses.str.split("-", expand=True)
wins_and_losses.columns = ["tournament_wins2", "tournament_losses2"]
wins_and_losses = wins_and_losses.astype(int)
results = pd.concat([results, wins_and_losses], axis=1)
results.tournament_wins2 -= results.bout_outcome2
results.tournament_losses2 -= results.bout_outcome1

In [332]:
# Encode the wrestler rank

power_ranks = pd.read_csv("data/power_rank.csv")
power_ranks_dict = dict(zip(power_ranks["rank"], power_ranks["power"]))

results.rank1 = results.rank1.replace(power_ranks_dict)
results.rank2 = results.rank2.replace(power_ranks_dict)

In [333]:
# Drop irrelevant columns
results = results.drop(columns=["tournament_score1", "tournament_score2", "name1", "name2", "kimarite"])

In [334]:
# Re-order columns
results = results[[
    'year',
    'month',
    'day',
    'id1',
    'rank1',
    'bout_outcome1',
    'tournament_wins1',
    'tournament_losses1',
    'id2',
    'rank2',
    'bout_outcome2',
    'tournament_wins2',
    'tournament_losses2',
]]

In [335]:
# Print the clean tournament data
results.tail()

Unnamed: 0,year,month,day,id1,rank1,bout_outcome1,tournament_wins1,tournament_losses1,id2,rank2,bout_outcome2,tournament_wins2,tournament_losses2
133519,2022,9,15,12370,19,1,10,4,2879,13,0,9,5
133520,2022,9,15,11980,12,1,9,5,12210,20,0,4,10
133521,2022,9,15,12210,20,0,4,10,11980,12,1,9,5
133522,2022,9,15,12130,20,0,4,10,12191,20,1,9,5
133523,2022,9,15,12191,20,1,9,5,12130,20,0,4,10


### Preprocess wrestler data

In [366]:
# Read wrestler records

banzuke = pd.read_csv("data/banzuke.csv")
banzuke.tail()

Unnamed: 0,basho,id,rank,rikishi,heya,shusshin,birth_date,height,weight,prev,prev_w,prev_l
174180,2022.09,12738,Jk17e,Okuyama,Hakkaku,Hokkaido,2003-06-03,179.0,136.0,Mz,2.0,1.0
174181,2022.09,12777,Jk17w,Sachinofuji,Isegahama,Saitama,2006-03-06,168.0,79.0,Mz,1.0,2.0
174182,2022.09,677,Jk18e,Itakozakura,Shikihide,Ibaraki,1978-10-02,178.0,139.0,Jk2e,0.0,2.0
174183,2022.09,2905,Jk18w,Higohikari,Kise,Kumamoto,1988-01-27,172.0,113.0,Jk8w,0.0,7.0
174184,2022.09,12546,Jk19e,Hamasaki,Otake,Fukuoka,2000-09-30,177.0,135.0,Mz,0.0,3.0


In [367]:
# Filter out Makuuchi division

banzuke["rank"] = banzuke["rank"].str.extract(makuuchi_regex, expand=True)
banzuke = banzuke.dropna(subset=["rank"])
banzuke = banzuke.reset_index(drop=True)

In [368]:
# Extract year and month

year_and_month = get_year_and_month_from_date(banzuke.basho)
banzuke = pd.concat([banzuke, year_and_month], axis=1)

In [369]:
# Add column `age`

banzuke.basho = pd.to_datetime(banzuke.basho.astype(str), format="%Y.%m")
banzuke.birth_date = pd.to_datetime(banzuke.birth_date)
banzuke["age"] = banzuke.basho - banzuke.birth_date
# convert to years
banzuke["age"] = banzuke.age / np.timedelta64(1, "Y")

In [370]:
# Add column body mass index

banzuke["bmi"] = 1.3 * banzuke.weight / np.power(banzuke.height / 100, 2.5)

In [371]:
# Drop and re-order columns

banzuke = banzuke.drop(columns=[
    "basho",
    "birth_date", 
    "rank",
    "rikishi",
    "heya", 
    "shusshin", 
    "prev",
])
banzuke.columns = [
    'id',
    'height',
    'weight',
    'previous_tournament_wins',
    'previous_tournament_losses',
    'year',
    'month',
    'age',
    'bmi',
]
banzuke = banzuke[[
    'year',
    'month',
    'id',
    'height',
    'weight',
    'bmi',
    'age',
    'previous_tournament_wins',
    'previous_tournament_losses',
]]

In [373]:
# Fix data types

banzuke["id"] = banzuke["id"].astype(int)
banzuke.previous_tournament_wins = banzuke.previous_tournament_wins.astype(int)
banzuke.previous_tournament_losses = banzuke.previous_tournament_losses.astype(int)

In [374]:
# Print the clean wrestler data
banzuke.tail()

Unnamed: 0,year,month,id,height,weight,bmi,age,previous_tournament_wins,previous_tournament_losses
9581,2022,9,12292,185.0,173.0,48.312569,28.942415,8,7
9582,2022,9,11868,169.0,114.0,39.914569,27.622744,6,9
9583,2022,9,12113,182.0,191.0,55.564607,31.099886,5,8
9584,2022,9,12406,187.0,183.0,49.749695,28.353765,9,6
9585,2022,9,12314,177.0,129.0,40.234619,22.365962,10,5


### Merge tournament and wrestler data

In [428]:
# Merge tournament and wrestler data

# Add wrestler 1 data
sumo = pd.merge(
    left = results,
    right = banzuke.add_suffix("1"),
    how = "left",
    left_on = ["year", "month", "id1"],
    right_on = ["year1", "month1", "id1"],
)
sumo = sumo.drop(columns=["year1", "month1",])

# Add wrestler 2 data
sumo = pd.merge(
    left = sumo,
    right = banzuke.add_suffix("2"),
    how = "left",
    left_on = ["year", "month", "id2"],
    right_on = ["year2", "month2", "id2"],
)
sumo = sumo.drop(columns=["year2", "month2",])

In [432]:
# Reorder columns

wrestler1_columns = sorted(list(sumo.columns[sumo.columns.str.contains("1")]))
wrestler2_columns = sorted(list(sumo.columns[sumo.columns.str.contains("2")]))
other_columns = ['year', 'month', 'day']

sumo = sumo[other_columns + wrestler1_columns + wrestler2_columns]

In [438]:
# Final clean `sumo` data

sumo.tail(2)

Unnamed: 0,year,month,day,age1,bmi1,bout_outcome1,height1,id1,previous_tournament_losses1,previous_tournament_wins1,...,bmi2,bout_outcome2,height2,id2,previous_tournament_losses2,previous_tournament_wins2,rank2,tournament_losses2,tournament_wins2,weight2
133522,2022,9,15,30.823357,48.122448,0,184.0,12130,5,10,...,58.721795,1,175.0,12191,4,11,20,5,9,183.0
133523,2022,9,15,26.073088,58.721795,1,175.0,12191,4,11,...,48.122448,0,184.0,12130,5,10,20,10,4,170.0


In [436]:
# Check for missing data
print(sumo.isna().sum())

year                           0
month                          0
day                            0
age1                           0
bmi1                           0
bout_outcome1                  0
height1                        0
id1                            0
previous_tournament_losses1    0
previous_tournament_wins1      0
rank1                          0
tournament_losses1             0
tournament_wins1               0
weight1                        0
age2                           0
bmi2                           0
bout_outcome2                  0
height2                        0
id2                            0
previous_tournament_losses2    0
previous_tournament_wins2      0
rank2                          0
tournament_losses2             0
tournament_wins2               0
weight2                        0
dtype: int64


In [437]:
# Verify data types
print(sumo.dtypes)

year                             int32
month                            int32
day                              int64
age1                           float64
bmi1                           float64
bout_outcome1                    int64
height1                        float64
id1                              int64
previous_tournament_losses1      int32
previous_tournament_wins1        int32
rank1                            int64
tournament_losses1               int64
tournament_wins1                 int64
weight1                        float64
age2                           float64
bmi2                           float64
bout_outcome2                    int64
height2                        float64
id2                              int64
previous_tournament_losses2      int32
previous_tournament_wins2        int32
rank2                            int64
tournament_losses2               int64
tournament_wins2                 int64
weight2                        float64
dtype: object


## Feature engineering

### Total percentage of won matches

`career_wins1` - total number of career wins in the top makuuchi division for this rikishi.

The values are first calculated as the cumulative sum of the bout outcomes for this rikishi. ThenThe values are shifted 1 row down to ensure the 

In [151]:
# Calculate win_percentage

new_sumo = pd.DataFrame()

for _, group_data in sumo.groupby("id1"):
    group_data["career_wins1"] = group_data.bout_outcome1.cumsum()
    # make values before the match valid
    group_data.loc[:, "career_wins1"] = group_data.loc[:, "career_wins1"].shift(periods=1).fillna(0)
    # fill in the leftover first value that is 'na'
    group_data.loc[:, "total_wins1"] = group_data.loc[:, "total_wins1"]

    # total played matches
    group_data["total_matches1"] = np.arange(0, len(group_data))

    # total win percentage
    group_data["win_percentage1"] = group_data["total_wins1"] / group_data["total_matches1"]

    # average the win pct for the first 10 records
    group_data.loc[group_data.total_matches1 < 10, "win_percentage1"] = group_data.win_percentage1.iloc[:10].mean()
    
    # append data
    new_sumo = pd.concat([new_sumo, group_data], axis=0)

sumo = new_sumo.sort_index()

In [151]:
# Calculate win_percentage

new_sumo = pd.DataFrame()

for _, group_data in sumo.groupby("id1"):
    # total career wins prior to the start of each match
    group_data["total_wins1"] = group_data.win1.cumsum()
    # make values before the match valid
    group_data.loc[:, "total_wins1"] = group_data.loc[:, "total_wins1"].shift(periods=1)
    # fill in the leftover first value that is 'na'
    group_data.loc[:, "total_wins1"] = group_data.loc[:, "total_wins1"].fillna(0)

    # total played matches
    group_data["total_matches1"] = np.arange(0, len(group_data))

    # total win percentage
    group_data["win_percentage1"] = group_data["total_wins1"] / group_data["total_matches1"]

    # average the win pct for the first 10 records
    group_data.loc[group_data.total_matches1 < 10, "win_percentage1"] = group_data.win_percentage1.iloc[:10].mean()
    
    # append data
    new_sumo = pd.concat([new_sumo, group_data], axis=0)

sumo = new_sumo.sort_index()

In [159]:
# # Calculate the total win percentage

# # total career wins prior to the start of each match
# sumo["total_wins1"] = sumo[sumo.id1 == 12130].win1.cumsum()
# sumo.loc[sumo.id1 == 12130, "total_wins1"] = sumo.loc[sumo.id1 == 12130, "total_wins1"].shift(periods=1)
# sumo.loc[sumo.id1 == 12130, "total_wins1"] = sumo.loc[sumo.id1 == 12130, "total_wins1"].fillna(0)

# # total played matches
# sumo.loc[sumo.id1 == 12130, "total_matches1"] = np.arange(0, len(sumo[sumo.id1 == 12130]))

# # total win percentage
# sumo["win_percentage1"] = sumo.loc[sumo.id1 == 12130, "total_wins1"] / sumo.loc[sumo.id1 == 12130, "total_matches1"]

# # average the win pct for the first 10 records
# sumo.loc[(sumo.id1 == 12130) & (sumo.total_matches1 < 10), "win_percentage1"] = \
# sumo.loc[sumo.id1 == 12130, "win_percentage1"].iloc[:10].mean()

# # # show result
# # sumo[sumo.id1 == 12130][['year', 'month', 'day', 'id1', 'rank1', 'win1', 'curr_wins1',
# #        'curr_losses1', 'height1', 'weight1', 'age1', 'prev_wins1',
# #        'prev_losses1', 'total_wins1', "total_matches1", "win_percentage1"]][:20]

### Total percentage of won matches  

In [279]:
# Custom vs buildin exponential weighted mean

dummy = sumo.groupby("pair_id").get_group(12131)
pd.DataFrame({
    "win1": dummy.win1,
    "custom": dummy.win1.rolling(window=len(weights), min_periods=1, closed="left").apply(weighted_sum).fillna(0.5),
    "ewm": dummy.win1.ewm(span=4).mean().shift(1).fillna(0.5),
})

Unnamed: 0,win1,custom,ewm
74297,1,0.5,0.5
74535,0,1.0,1.0
75823,1,0.333333,0.375
77375,0,0.712644,0.693878
82117,0,0.326087,0.375
85069,1,0.130435,0.212353
85905,1,0.597826,0.54283
86195,1,0.815217,0.730965
87213,1,0.945652,0.840417
88647,0,1.0,0.9049


In [None]:
# Calculate `winning_streak` - the weighted average of the last 4 fights vs the same opponent

# Encode distinct pairs, matches of wrestlers
match_ids = sumo.id1.astype(str) + ", " + sumo.id2.astype(str)
codes, uniques = pd.factorize(match_ids)
sumo["pair_id"] = codes

# Define the exponential mean function
weights = np.array([0.05, 0.12, 0.25, 0.50])
def weighted_sum(subset):
    w = weights[4 - len(subset):]
    return np.sum(w * subset) / np.sum(w)

In [None]:
# Calculate `winning_streak` vs oponent
result = pd.Series(name="winning_streak", dtype="float64")
for _, group_data in sumo.groupby("pair_id").win1:
    winning_streak = group_data.rolling(window=len(weights), min_periods=1, closed="left").apply(weighted_sum)
    result = pd.concat([result, winning_streak])

result = result.fillna(value=0.5)
sumo["winning_streak1"] = result

In [251]:
# Calculate `tournament_performance` (same idea as above)
result = pd.Series(name="tournament_performance", dtype="float64")
for _, group_data in sumo.groupby(["year", "month"]).win1:
    tournament_performance = group_data.rolling(window=len(weights), min_periods=1, closed="left").apply(weighted_sum)
    result = pd.concat([result, tournament_performance])
    
result = result.fillna(value=0.5)
sumo["tournament_performance1"] = result

In [280]:
sumo

Unnamed: 0,year,month,day,id1,rank1,win1,curr_wins1,curr_losses1,height1,weight1,...,weight2,age2,prev_wins2,prev_losses2,total_wins1,total_matches1,win_percentage1,pair_id,winning_streak1,tournament_performance1
0,1983,1,1,4122,4,0,0,0,184.0,130.0,...,117.0,27.685716,8,7,0.0,0,0.015625,0,0.500000,0.500000
1,1983,1,1,4131,5,1,0,0,171.0,117.0,...,130.0,31.639253,6,9,0.0,0,0.677557,1,0.500000,0.000000
2,1983,1,1,1348,5,0,0,0,187.0,126.0,...,122.0,27.384546,7,8,0.0,0,0.396693,2,0.500000,0.666667
3,1983,1,1,4124,6,1,0,0,189.5,122.0,...,126.0,23.023060,10,5,0.0,0,0.810229,3,0.500000,0.287356
4,1983,1,1,1334,7,1,0,0,188.0,143.0,...,176.0,29.010863,10,5,0.0,0,0.806526,4,0.500000,0.673913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133519,2022,9,15,12370,19,1,10,4,180.0,129.0,...,139.0,35.310787,7,8,121.0,199,0.608040,20449,1.000000,0.271739
133520,2022,9,15,11980,12,1,9,5,187.0,143.0,...,172.0,29.684388,2,5,42.0,74,0.567568,21524,0.500000,0.673913
133521,2022,9,15,12210,20,0,4,10,179.0,172.0,...,143.0,28.906822,6,9,348.0,602,0.578073,21525,0.500000,0.869565
133522,2022,9,15,12130,20,0,4,10,184.0,170.0,...,183.0,26.073088,11,4,313.0,589,0.531409,18852,0.130435,0.402174
