In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re

from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer

# Sumo Matches Forecast

https://data.world/cervus/sumo-japan

https://josh-sulkers.medium.com/building-a-sumo-wrestling-match-predictor-using-machine-learning-2ac95c5e20a3

first wrestler result after the bout (final result in brackets)





Wrestler performance can be viewed from multiple perspectives:
- the overall career wins 
- the wins in the current tournament; each tournament is 15 days, 1 bout / day
- bouts vs current opponent
This way we can capture the overall, current, and vs opponent performance of the rikishi.

We can use several metrics: 
- absolute number of wins
- proportion of wins
- weighted moving average (WMA) (e.g. last 4 bouts)
    - exponentially weighted
    - other (custom) weights

The absolute number of wins treats every bout equally, whereas in a WMA recent observations are given relatively more weight than older ones.


**Features**

Physical traits
- age
- height
- weight
- body mass index - calculated

Wrestler performance
- wins vs current opponent, exponentially WMA of wins for the last 10 bouts
- wins vs 


career_wins1
tournament_wins1
enemy_wins


We will treat the problem as a 2-class-value classification problem:
- class 0 - wrestler1 loses (the opponent wins)
- class 1 - wrestler1 wins



## Prepare data

### Preprocess tournament data

In [3]:
# Load the raw bout records and give the columns descriptive names.
# The suffix 1 / 2 marks the two wrestlers of the bout.

results = pd.read_csv("data/results.csv")
results.columns = (
    ["basho", "day"]
    + ["id1", "rank1", "name1", "tournament_score1", "bout_outcome1"]
    + ["kimarite"]
    + ["id2", "rank2", "name2", "tournament_score2", "bout_outcome2"]
)
results.tail()

Unnamed: 0,basho,day,id1,rank1,name1,tournament_score1,bout_outcome1,kimarite,id2,rank2,name2,tournament_score2,bout_outcome2
224775,2022.09,15,12370,S1e,Wakatakakage,11-4,1,yorikiri,2879,M5w,Sadanoumi,9-6,0
224776,2022.09,15,11980,M6e,Wakamotoharu,10-5,1,yorikiri,12210,O2w,Mitakeumi,4-11,0
224777,2022.09,15,12210,O2w,Mitakeumi,4-11,0,yorikiri,11980,M6e,Wakamotoharu,10-5,1
224778,2022.09,15,12130,O1w,Shodai,4-11,0,oshidashi,12191,O1e,Takakeisho,10-5,1
224779,2022.09,15,12191,O1e,Takakeisho,10-5,1,oshidashi,12130,O1w,Shodai,4-11,0


In [4]:
# Keep only Makuuchi division bouts
#
# The pattern captures a rank made of the letters Y/O/S/K/M followed by its
# number (e.g. "S1e" -> "S1", "M6e" -> "M6"). The number is matched with
# [0-9]+: the former [1-9]+ stopped at the first zero, so a rank such as
# "M10e" was captured as "M1" and treated as a different rank afterwards.
# Ranks that do not match the pattern become NaN.

makuuchi_regex = r"([YOSKM]+[0-9]+)"
results.rank1 = results.rank1.str.extract(makuuchi_regex, expand=False)
results.rank2 = results.rank2.str.extract(makuuchi_regex, expand=False)

# Drop every row with a missing value in any column; this removes the bouts
# where at least one wrestler's rank did not match the pattern above.
results = results.dropna().reset_index(drop=True)

In [5]:
# # Drop duplicate matches

# results.loc[results.rikishi1_id > results.rikishi2_id, "match_id"] = \
# results.basho.astype(str) + "," + results.day.astype(str) + "," + results.rikishi1_id.astype(str) + "," + results.rikishi2_id.astype(str)

# results.loc[results.rikishi1_id < results.rikishi2_id, "match_id"] = \
# results.basho.astype(str) + "," + results.day.astype(str) + "," + results.rikishi2_id.astype(str) + "," + results.rikishi1_id.astype(str)

# results = results.drop_duplicates(subset="match_id")
# results = results.drop(columns="match_id")
# results = results.reset_index(drop=True)

In [6]:
# Extract year and month columns

def get_year_and_month_from_date(date_column):
    """Split a "<year>.<month>" date column into integer year and month.

    Parameters
    ----------
    date_column : pandas.Series
        Values whose string form is "<year>.<month>", e.g. 2022.09.

    Returns
    -------
    pandas.DataFrame
        Two integer columns, "year" and "month", on the index of the input.
    """
    parts = date_column.astype(str).str.split(".", expand=True)
    return parts.set_axis(["year", "month"], axis=1).astype(int)

# Replace the combined `basho` value with separate integer `year` / `month` columns
year_and_month = get_year_and_month_from_date(results["basho"])
results = results.join(year_and_month).drop(columns="basho")

In [7]:
# Create columns for tournament wins and losses (before the current bout)
#
# `tournament_score<N>` is a "wins-losses" string that already includes the
# current bout, so the bout's own result is subtracted again: a wrestler's win
# is his own outcome flag, his loss is the opponent's outcome flag.

score_regex = r"([0-9]+-[0-9]+)"

for wrestler, opponent in (("1", "2"), ("2", "1")):
    # First "wins-losses" match of the score string, split into two integer columns
    score = results[f"tournament_score{wrestler}"].str.extract(score_regex)[0]
    wins_and_losses = score.str.split("-", expand=True)
    wins_and_losses.columns = [f"tournament_wins{wrestler}", f"tournament_losses{wrestler}"]
    wins_and_losses = wins_and_losses.astype(int)
    results = pd.concat([results, wins_and_losses], axis=1)

    # Subtract the current bout outcome to get the count before the fight
    results[f"tournament_wins{wrestler}"] -= results[f"bout_outcome{wrestler}"]
    results[f"tournament_losses{wrestler}"] -= results[f"bout_outcome{opponent}"]

In [8]:
# Encode the wrestler rank as the numeric "power" value from the lookup table

power_ranks = pd.read_csv("data/power_rank.csv")
power_ranks_dict = dict(zip(power_ranks["rank"], power_ranks["power"]))

# Fail loudly when a rank has no entry in the lookup table instead of carrying
# an unencoded value into the numeric feature columns.
unknown_ranks = (set(results.rank1) | set(results.rank2)) - set(power_ranks_dict)
if unknown_ranks:
    raise ValueError(
        f"Ranks missing from data/power_rank.csv: {sorted(map(str, unknown_ranks))}"
    )

# Series.map is a plain dictionary lookup and yields a numeric column directly.
# Series.replace only got there by downcasting an object column, which recent
# pandas versions deprecate.
results.rank1 = results.rank1.map(power_ranks_dict)
results.rank2 = results.rank2.map(power_ranks_dict)

In [9]:
# Drop irrelevant columns: the raw score strings (already parsed into the
# tournament wins/losses columns), the wrestler names and `kimarite`
results = results.drop(columns=["tournament_score1", "tournament_score2", "name1", "name2", "kimarite"])

In [10]:
# Re-order columns: date first, then wrestler 1 and wrestler 2 with the same field order
results = results.loc[:, [
    "year", "month", "day",
    "id1", "rank1", "bout_outcome1", "tournament_wins1", "tournament_losses1",
    "id2", "rank2", "bout_outcome2", "tournament_wins2", "tournament_losses2",
]]

In [11]:
# Show the last rows of the cleaned tournament data
results.tail()

Unnamed: 0,year,month,day,id1,rank1,bout_outcome1,tournament_wins1,tournament_losses1,id2,rank2,bout_outcome2,tournament_wins2,tournament_losses2
133519,2022,9,15,12370,19,1,10,4,2879,13,0,9,5
133520,2022,9,15,11980,12,1,9,5,12210,20,0,4,10
133521,2022,9,15,12210,20,0,4,10,11980,12,1,9,5
133522,2022,9,15,12130,20,0,4,10,12191,20,1,9,5
133523,2022,9,15,12191,20,1,9,5,12130,20,0,4,10


### Preprocess wrestler data

In [12]:
# Read wrestler records from the banzuke CSV file

banzuke = pd.read_csv("data/banzuke.csv")
# Display the last rows as a quick check of the loaded columns
banzuke.tail()

Unnamed: 0,basho,id,rank,rikishi,heya,shusshin,birth_date,height,weight,prev,prev_w,prev_l
174180,2022.09,12738,Jk17e,Okuyama,Hakkaku,Hokkaido,2003-06-03,179.0,136.0,Mz,2.0,1.0
174181,2022.09,12777,Jk17w,Sachinofuji,Isegahama,Saitama,2006-03-06,168.0,79.0,Mz,1.0,2.0
174182,2022.09,677,Jk18e,Itakozakura,Shikihide,Ibaraki,1978-10-02,178.0,139.0,Jk2e,0.0,2.0
174183,2022.09,2905,Jk18w,Higohikari,Kise,Kumamoto,1988-01-27,172.0,113.0,Jk8w,0.0,7.0
174184,2022.09,12546,Jk19e,Hamasaki,Otake,Fukuoka,2000-09-30,177.0,135.0,Mz,0.0,3.0


In [13]:
# Get the ids of several famous sumo wrestlers
# (the id of the first banzuke row whose ring name matches)

famous_ids = {}
for ring_name in ("Hakuho", "Kotooshu", "Enho"):
    famous_ids[ring_name] = banzuke.loc[banzuke.rikishi == ring_name, "id"].iloc[0]
    print(ring_name, famous_ids[ring_name])

hakuho_id, kotooshu_id, enho_id = famous_ids.values()

Hakuho 1123
Kotooshu 2830
Enho 12412


In [14]:
# Keep only Makuuchi division records: ranks that do not match
# `makuuchi_regex` become NaN and their rows are dropped

makuuchi_ranks = banzuke["rank"].str.extract(makuuchi_regex, expand=False)
banzuke = (
    banzuke
    .assign(rank=makuuchi_ranks)
    .dropna(subset=["rank"])
    .reset_index(drop=True)
)

In [15]:
# Extract integer `year` and `month` columns from `basho`
# (`basho` itself is kept; it is still needed to compute the age)

year_and_month = get_year_and_month_from_date(banzuke["basho"])
banzuke = banzuke.join(year_and_month)

In [16]:
# Add column `age`: years between the birth date and the first day of the
# tournament month

banzuke["basho"] = pd.to_datetime(banzuke["basho"].astype(str), format="%Y.%m")
banzuke["birth_date"] = pd.to_datetime(banzuke["birth_date"])

# Convert the elapsed days to years. Dividing by np.timedelta64(1, "Y") is
# avoided because pandas 2.x rejects the ambiguous "Y" unit. 365.2425 days is
# the year length that unit stood for, so the resulting values are unchanged.
DAYS_PER_YEAR = 365.2425
banzuke["age"] = (banzuke["basho"] - banzuke["birth_date"]).dt.days / DAYS_PER_YEAR

In [17]:
# Add column body mass index
#
# NOTE(review): this is not the classic weight / height**2 formula; it looks
# like the "new BMI" variant 1.3 * weight / height**2.5 - confirm it is intended.
# height is divided by 100 first (presumably cm -> m; weight presumably in kg).

banzuke["bmi"] = 1.3 * banzuke.weight / np.power(banzuke.height / 100, 2.5)

In [18]:
# Drop the columns that are no longer needed, give the previous-tournament
# record descriptive names and put the remaining columns in a fixed order

banzuke = (
    banzuke
    .drop(columns=["basho", "birth_date", "rank", "rikishi", "heya", "shusshin", "prev"])
    .rename(columns={
        "prev_w": "previous_tournament_wins",
        "prev_l": "previous_tournament_losses",
    })
    .loc[:, [
        "year",
        "month",
        "id",
        "height",
        "weight",
        "bmi",
        "age",
        "previous_tournament_wins",
        "previous_tournament_losses",
    ]]
)

In [19]:
# Fix data types: the id and the previous-tournament counts are whole numbers

banzuke = banzuke.astype({
    "id": int,
    "previous_tournament_wins": int,
    "previous_tournament_losses": int,
})

In [20]:
# Show the last rows of the cleaned wrestler data
banzuke.tail()

Unnamed: 0,year,month,id,height,weight,bmi,age,previous_tournament_wins,previous_tournament_losses
9581,2022,9,12292,185.0,173.0,48.312569,28.942415,8,7
9582,2022,9,11868,169.0,114.0,39.914569,27.622744,6,9
9583,2022,9,12113,182.0,191.0,55.564607,31.099886,5,8
9584,2022,9,12406,187.0,183.0,49.749695,28.353765,9,6
9585,2022,9,12314,177.0,129.0,40.234619,22.365962,10,5


### Merge data

In [21]:
# Merge tournament and wrestler data
#
# The wrestler table is joined twice, once per wrestler of the bout. Its
# columns get the wrestler's suffix first, so the join keys are
# (year, month, id<N>); the suffixed year/month copies are dropped afterwards.

sumo = results
for wrestler in ("1", "2"):
    wrestler_stats = banzuke.add_suffix(wrestler)
    sumo = sumo.merge(
        wrestler_stats,
        how="left",
        left_on=["year", "month", f"id{wrestler}"],
        right_on=[f"year{wrestler}", f"month{wrestler}", f"id{wrestler}"],
    ).drop(columns=[f"year{wrestler}", f"month{wrestler}"])

In [22]:
# Reorder columns: date columns first, then the alphabetically sorted
# wrestler 1 columns, then the wrestler 2 columns

other_columns = ['year', 'month', 'day']
wrestler1_columns = sorted(name for name in sumo.columns if "1" in name)
wrestler2_columns = sorted(name for name in sumo.columns if "2" in name)

sumo = sumo[other_columns + wrestler1_columns + wrestler2_columns]

In [23]:
# Final clean `sumo` data: one row per bout and wrestler perspective,
# with the bout columns and both wrestlers' attributes

sumo.tail(2)

Unnamed: 0,year,month,day,age1,bmi1,bout_outcome1,height1,id1,previous_tournament_losses1,previous_tournament_wins1,...,bmi2,bout_outcome2,height2,id2,previous_tournament_losses2,previous_tournament_wins2,rank2,tournament_losses2,tournament_wins2,weight2
133522,2022,9,15,30.823357,48.122448,0,184.0,12130,5,10,...,58.721795,1,175.0,12191,4,11,20,5,9,183.0
133523,2022,9,15,26.073088,58.721795,1,175.0,12191,4,11,...,48.122448,0,184.0,12130,5,10,20,10,4,170.0


In [24]:
# Check for missing data: number of NaN values per column
print(sumo.isna().sum())

year                           0
month                          0
day                            0
age1                           0
bmi1                           0
bout_outcome1                  0
height1                        0
id1                            0
previous_tournament_losses1    0
previous_tournament_wins1      0
rank1                          0
tournament_losses1             0
tournament_wins1               0
weight1                        0
age2                           0
bmi2                           0
bout_outcome2                  0
height2                        0
id2                            0
previous_tournament_losses2    0
previous_tournament_wins2      0
rank2                          0
tournament_losses2             0
tournament_wins2               0
weight2                        0
dtype: int64


In [25]:
# Verify data types: every column should be numeric at this point
print(sumo.dtypes)

year                             int32
month                            int32
day                              int64
age1                           float64
bmi1                           float64
bout_outcome1                    int64
height1                        float64
id1                              int64
previous_tournament_losses1      int32
previous_tournament_wins1        int32
rank1                            int64
tournament_losses1               int64
tournament_wins1                 int64
weight1                        float64
age2                           float64
bmi2                           float64
bout_outcome2                    int64
height2                        float64
id2                              int64
previous_tournament_losses2      int32
previous_tournament_wins2        int32
rank2                            int64
tournament_losses2               int64
tournament_wins2                 int64
weight2                        float64
dtype: object


## Feature engineering

### Career wins

`career_wins1`
- Общ брой спечелени двубой до момента на конкретната среща.
- Total number of career wins in the top makuuchi division for this rikishi.
- The values are first calculated as the cumulative sum of the `bout_outcomes` for this rikishi.
- Then, they are shifted 1 row down to make them valid as "before the bout" statistics.
- Finally, the NA value generated in the first row after shifting is replaced with 0 as the rikishi had no wins at the start of his career. 

`career_bouts1`
- Общо изиграни двубой в Макуучи.
- Total number of played matches (bouts) by the rikishi in Makuuchi.

`career_win_percentage1`
- Win percentage.
- Процент спечелени двубой по време на цялата кариера.
- The percentage for each of the first 10 bouts is replaced by the average over those bouts to accommodate the low number of matches.

In [26]:
# Calculate career wins, bouts and win percentage per wrestler, as known
# *before* each bout.
#
# Computed with vectorised groupby operations. Concatenating one group at a
# time re-copies the growing frame on every iteration, and writing into the
# groupby slices triggers SettingWithCopy warnings.
# Rows are assumed to be in chronological order within each wrestler.

outcomes_by_wrestler = sumo.groupby("id1")["bout_outcome1"]

# Wins before the bout = running total including this bout minus its own outcome
career_wins = (outcomes_by_wrestler.cumsum() - sumo["bout_outcome1"]).astype(float)
# Number of earlier bouts of the wrestler: 0 on his first row
career_bouts = outcomes_by_wrestler.cumcount()

# 0 / 0 on a wrestler's first bout gives NaN here
career_win_percentage = career_wins / career_bouts

# The first 10 bouts of every wrestler all receive the mean percentage of those
# bouts (NaN skipped), because single values are unstable with so few matches.
# NOTE(review): this mean uses outcomes that lie in the future of the early
# rows, and it stays NaN for a wrestler with a single bout - confirm that both
# are acceptable for the model.
in_first_10 = career_bouts < 10
first_10_mean = (
    career_win_percentage
    .where(in_first_10)
    .groupby(sumo["id1"])
    .transform("mean")
)
career_win_percentage = career_win_percentage.where(~in_first_10, first_10_mean)

sumo = sumo.assign(
    career_wins1=career_wins,
    career_bouts1=career_bouts,
    career_win_percentage1=career_win_percentage,
).sort_index()

In [27]:
# Tests: print the latest career statistics of three known wrestlers
career_columns = ["career_wins1", "career_bouts1", "career_win_percentage1"]

for ring_name, wrestler_id in (("Hakuho", 1123), ("Kotooshu", 2830), ("Enho", 12412)):
    print(f"\n {ring_name} id={wrestler_id}")
    print(sumo.loc[sumo.id1 == wrestler_id, career_columns].tail())


 Hakuho id=1123
        career_wins1  career_bouts1  career_win_percentage1
129318        1093.0           1296                0.843364
129356        1094.0           1297                0.843485
129394        1095.0           1298                0.843606
129432        1096.0           1299                0.843726
129470        1097.0           1300                0.843846

 Kotooshu id=2830
        career_wins1  career_bouts1  career_win_percentage1
104202         466.0            784                0.594388
104243         466.0            785                0.593631
104291         466.0            786                0.592875
104331         466.0            787                0.592122
104364         466.0            788                0.591371

 Enho id=12412
        career_wins1  career_bouts1  career_win_percentage1
127087          59.0            129                0.457364
127117          59.0            130                0.453846
127155          59.0            131             

### Encounter wins

#### Same-opponents match identifier
First, a new variable containing the encoded identifier of the pairs of wrestlers. It will be used to group the data for each pair of opponents.

`encounter_id`
- Unique identifier for each wrestler-opponent couple.
- The couples are counted twice - once from the perspective of each wrestler.

`encounter_wins`
- The weighted average of the last 4 fights vs the same opponent.

In [28]:
# One integer id per ordered "id1, id2" pair
encounters = sumo["id1"].astype(str).str.cat(sumo["id2"].astype(str), sep=", ")
encounter_ids = pd.factorize(encounters)[0]
sumo["encounter_id1"] = encounter_ids

In [29]:
# Test: there are exactly as many encounter ids as distinct "id1, id2" strings
assert sumo.encounter_id1.nunique() == encounters.nunique()

In [30]:
# Test - single id for Hakuho (1123) and Kotooshu (2830) matches
# (only rows with Hakuho as wrestler 1 are checked; the reversed pair is a
# different "id1, id2" string and therefore has its own id)
assert sumo.loc[(sumo.id1 == 1123) & (sumo.id2 == 2830), "encounter_id1"].nunique() == 1

#### Select weighted moving average function

Moving average = rolling mean

Choosing between several types of weighted moving averages:
- with custom coefficients: considers only a fixed number of the most recent records (finite window size).
- with exponential coefficients: considers all previous records; pandas built-in method (`ewm`).

The custom one is better as it takes only a certain number (4) of the most recent bouts and disregards the older ones. In contrast, EWM uses all bouts.

In [31]:
# Custom weighted moving average function
# Weights from the oldest to the most recent observation of the window
weights = np.array([0.05, 0.12, 0.25, 0.50])
def weighted_mean(subset):
    """Weighted mean of a window, with the most recent values weighted most.

    The last value of `subset` gets the last (largest) weight. A window shorter
    than `weights` uses only the trailing weights, renormalised by their sum.

    Parameters
    ----------
    subset : array-like of numbers
        Window values ordered from oldest to most recent.

    Returns
    -------
    float
        The weighted mean; NaN for an empty window. When the window is longer
        than `weights`, only its most recent `len(weights)` values are used.
    """
    # Work on a plain array so that slicing is positional for any input type,
    # and keep at most as many values as there are weights
    values = np.asarray(subset, dtype=float)[-len(weights):]
    if values.size == 0:
        return np.nan
    w = weights[len(weights) - values.size:]
    return np.sum(w * values) / w.sum()

In [32]:
# Comparison of the custom weighted mean and the exponentially weighted mean
# on the bouts of a single encounter (id 12131)
dummy_data = sumo.groupby("encounter_id1").get_group(12131)

bout_outcome1 = dummy_data["bout_outcome1"]
# closed="left" leaves the current bout out of its own window
custom_mean = bout_outcome1.rolling(window=4, min_periods=1, closed="left").apply(weighted_mean)
# shift(1) moves every value one row down, so each row only sees earlier bouts
ewm = bout_outcome1.ewm(span=4).mean().shift(1)

pd.concat(
    [bout_outcome1, custom_mean, ewm],
    axis=1,
    keys=["bout_outcome1", "custom", "ewm"],
)

Unnamed: 0,bout_outcome1,custom,ewm
74297,1,,
74535,0,1.0,1.0
75823,1,0.333333,0.375
77375,0,0.712644,0.693878
82117,0,0.326087,0.375
85069,1,0.130435,0.212353
85905,1,0.597826,0.54283
86195,1,0.815217,0.730965
87213,1,0.945652,0.840417
88647,0,1.0,0.9049


The custom moving average gives 1 on the final row (88647). 

#### Generate variables

In [40]:
# Encounter statistics per (wrestler, opponent) pair, as known *before* each bout.
#
# Computed with vectorised groupby operations. Concatenating one group at a
# time re-copies the growing frame on every iteration, and writing into the
# groupby slices triggers SettingWithCopy warnings.
encounter_outcomes = sumo.groupby("encounter_id1")["bout_outcome1"]

# Wins before the bout = running total including this bout minus its own outcome
encounter_wins = (encounter_outcomes.cumsum() - sumo["bout_outcome1"]).astype(float)
# Number of earlier bouts against this opponent: 0 on the first meeting
encounter_bouts = encounter_outcomes.cumcount()
# Exponentially weighted mean of the earlier outcomes; shift(1) keeps the
# current bout out of its own feature, leaving NaN on the first meeting
encounter_win_rolling = encounter_outcomes.transform(
    lambda outcomes: outcomes.ewm(span=4).mean().shift(1)
)

dummy_sumo = sumo.assign(
    encounter_wins1=encounter_wins,
    encounter_bouts1=encounter_bouts,
    # 0 / 0 on the first meeting is NaN and is filled with 0
    encounter_win_percentage1=(encounter_wins / encounter_bouts).fillna(0),
    # no history yet: fall back to 0.5
    encounter_win_rolling1=encounter_win_rolling.fillna(0.5),
).sort_index()

In [35]:
# Test with encounter_id=12131: show the outcomes next to the derived features
encounter_columns = [
    "bout_outcome1",
    "bout_outcome2",
    "encounter_wins1",
    "encounter_bouts1",
    "encounter_win_percentage1",
    "encounter_win_rolling1",
]
dummy_sumo.loc[dummy_sumo.encounter_id1 == 12131, encounter_columns]

Unnamed: 0,bout_outcome1,bout_outcome2,encounter_wins1,encounter_bouts1,encounter_win_percentage1,encounter_win_rolling1
74297,1,0,0.0,0,0.0,0.630594
74535,0,1,1.0,1,1.0,1.0
75823,1,0,1.0,2,0.5,0.375
77375,0,1,2.0,3,0.666667,0.693878
82117,0,1,2.0,4,0.5,0.375
85069,1,0,2.0,5,0.4,0.212353
85905,1,0,3.0,6,0.5,0.54283
86195,1,0,4.0,7,0.571429,0.730965
87213,1,0,5.0,8,0.625,0.840417
88647,0,1,6.0,9,0.666667,0.9049


In [41]:
# Continue with the frame that includes the encounter features
sumo = dummy_sumo

### Tournament

Same idea as above. This time group by wrestler and tournament.

In [42]:
# Tournament statistics per wrestler and tournament (year, month), as known
# *before* each bout.
#
# Computed with vectorised groupby operations. Concatenating one group at a
# time re-copies the growing frame on every iteration, and writing into the
# groupby slices triggers SettingWithCopy warnings.
# Note: `tournament_wins1` already exists in the frame and is overwritten here
# with the count derived from the bout outcomes.
tournament_outcomes = sumo.groupby(["year", "month", "id1"])["bout_outcome1"]

# Wins before the bout = running total including this bout minus its own outcome
tournament_wins = (tournament_outcomes.cumsum() - sumo["bout_outcome1"]).astype(float)
# Number of earlier bouts of the wrestler in this tournament: 0 on his first row
tournament_bouts = tournament_outcomes.cumcount()
# Exponentially weighted mean of the earlier outcomes; shift(1) keeps the
# current bout out of its own feature, leaving NaN on the first bout
tournament_win_rolling = tournament_outcomes.transform(
    lambda outcomes: outcomes.ewm(span=4).mean().shift(1)
)

dummy_sumo = sumo.assign(
    tournament_wins1=tournament_wins,
    tournament_bouts1=tournament_bouts,
    # 0 / 0 on the first bout is NaN and is filled with 0
    tournament_win_percentage1=(tournament_wins / tournament_bouts).fillna(0),
    # no history yet: fall back to 0.5
    tournament_win_rolling1=tournament_win_rolling.fillna(0.5),
).sort_index()

In [43]:
# Test: Jan 2010 tournament, Kotooshu id1=2830
is_kotooshu_jan_2010 = (
    (dummy_sumo.year == 2010)
    & (dummy_sumo.month == 1)
    & (dummy_sumo.id1 == 2830)
)
tournament_columns = [
    "bout_outcome1",
    "bout_outcome2",
    "tournament_wins1",
    "tournament_bouts1",
    "tournament_win_percentage1",
    "tournament_win_rolling1",
]
dummy_sumo.loc[is_kotooshu_jan_2010, tournament_columns]

Unnamed: 0,bout_outcome1,bout_outcome2,tournament_wins1,tournament_bouts1,tournament_win_percentage1,tournament_win_rolling1
89833,1,0,0.0,0,0.0,0.5
89869,1,0,1.0,1,1.0,1.0
89912,1,0,2.0,2,1.0,1.0
89957,0,1,3.0,3,1.0,1.0
89998,1,0,3.0,4,0.75,0.540441
90033,1,0,4.0,5,0.8,0.739764
90075,1,0,5.0,6,0.833333,0.848953
90117,1,0,6.0,7,0.857143,0.911112
90158,0,1,7.0,8,0.875,0.947274
90196,0,1,7.0,9,0.777778,0.564507


In [44]:
# Continue with the frame that includes the tournament features
sumo = dummy_sumo

## Model

### Scale