In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns; sns.set()
import scipy
import scipy.stats as stats

In [2]:
from utils.variables import pack, unpack
from utils.display import table, tabulate
from utils.files import download_sheet, save, load
from utils.hash import digest

from game_model import game
from rp_model import compute_rp, make_precomputed_columns
from initial_guess import make_initial_guess


In [3]:
# stuff for display

from IPython.lib.pretty import pretty, pprint

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 120)

In [4]:
# Load previous data

data = pd.read_pickle('./data/rp-data.pickle')
data.describe()
data.head()

Unnamed: 0,Level,RP,MS lvl,ModelRP,Difference,Freq1,FreqL,Inv,Berry1,BerryL,Ing1P,Helps per hour,Helps Neutral,NrgMult,IngrMult,SkillMult,Ingr%,SklContr,BerryD,IngD,Dupes,Amnt,Ing2P,Help skill bonus,RP Multiplier
count,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0,5735.0
mean,10.054577,585.904446,1.445336,584.709459,0.020009,3970.305144,3826.666908,12.123627,28.033653,37.155013,108.791805,1.010856,0.98857,0.994532,1.001883,1.00551,0.207906,31.745737,54.02354,147.813601,1.0,0.047602,1.873583,0.988221,1.02694
std,6.839047,371.356752,0.961285,369.419134,0.941834,947.560612,1011.749091,3.356021,2.614471,7.556406,15.702953,0.29962,0.059088,0.045185,0.118457,0.118344,0.044695,35.855732,26.374069,58.38261,0.0,0.420109,16.313341,0.036752,0.0685
min,1.0,245.0,1.0,-1.0,-13.71,2200.0,1489.1184,7.0,24.0,24.0,90.0,0.52,0.89,0.92,0.8,0.8,0.0,0.0,25.0,90.0,1.0,0.0,0.0,0.79,1.0
25%,5.0,352.0,1.0,351.455,-0.28,3300.0,2992.68,10.0,26.0,32.0,98.0,0.8,1.0,1.0,1.0,1.0,0.18,12.83,35.0,101.0,1.0,0.0,0.0,1.0,1.0
50%,9.0,448.0,1.0,447.83,-0.02,4000.0,3819.2,11.0,28.0,36.0,103.0,0.94,1.0,1.0,1.0,1.0,0.2,19.24,44.0,121.0,1.0,0.0,0.0,1.0,1.0
75%,13.0,692.0,1.0,690.49,0.27,4500.0,4455.0,14.0,30.0,41.0,115.0,1.2,1.0,1.0,1.0,1.0,0.235,43.125,66.0,202.0,1.0,0.0,0.0,1.0,1.0
max,42.0,3440.0,6.0,3440.33,8.91,6300.0,6916.14,23.0,35.0,74.0,151.0,2.41,1.12,1.08,1.2,1.2,0.42,413.8,213.0,511.0,1.0,8.0,342.0,1.0,1.44


Unnamed: 0,Pokemon,Level,RP,Nature,MS lvl,Source,ModelRP,Difference,Nature2,ID,NatureP,NatureN,Freq1,FreqL,Type,Class,MSkill,Inv,Berry1,BerryL,Ing1,Ing1P,Helps per hour,Helps Neutral,NrgMult,IngrMult,SkillMult,Ingr%,SklContr,RPneutral,IDNeutral,BerryD,IngD,Dupes,DupeMatch,Sub Skill 1,Sub Skill 2,Ingredient 2,Amnt,Ing2P,Help skill bonus,RP Multiplier
0,Arbok,8.0,531.0,Hardy,1.0,Questions help guides,530.72,-0.28,Neutral,ArbokNeutral81,-,-,3700.0,3648.2,Poison,Berries,Charge Energy S,14.0,32.0,39,Bean Sausage,103.0,0.98,1.0,1.0,1.0,1.0,0.26,22.77,531.0,ArbokNeutral81,78.0,103.0,1,True,,,,0.0,0.0,1.0,1.0
1,Arbok,9.0,574.0,Naughty,1.0,Rate My Mon,574.43,0.43,Naughty,ArbokNaughty91,Speed of Help,Main Skill Chance,3700.0,3276.72,Poison,Berries,Charge Energy S,14.0,32.0,40,Bean Sausage,103.0,1.09,0.9,1.0,1.0,0.8,0.26,22.77,538.37,ArbokNeutral91,80.0,103.0,1,True,,,,0.0,0.0,1.0,1.0
2,Arbok,8.0,538.0,Quiet,1.0,Questions help guides,538.1,0.1,Quiet,ArbokQuiet81,Ingredient Finding,Exp Gains,3700.0,3648.2,Poison,Berries,Charge Energy S,14.0,32.0,39,Bean Sausage,103.0,0.98,1.0,1.0,1.2,1.0,0.26,22.77,530.61,ArbokNeutral81,78.0,103.0,1,True,,,,0.0,0.0,1.0,1.0
3,Arcanine,5.0,958.0,Calm,3.0,,957.99,-0.01,Calm,ArcanineCalm53,Main Skill Chance,Speed of Help,2500.0,2728.0,Fire,Skills,Extra Helpful S,16.0,27.0,31,Fiery Herb,130.0,1.31,1.11,1.0,1.0,1.2,0.14,84.43,937.95,ArcanineNeutral53,31.0,130.0,1,True,,,,0.0,0.0,1.0,1.0
4,Arcanine,3.0,715.0,Lax,2.0,pokemon sleep general,715.12,0.12,Lax,ArcanineLax32,Energy Recovery,Main Skill Chance,2500.0,2490.0,Fire,Skills,Extra Helpful S,16.0,27.0,29,Fiery Herb,130.0,1.44,1.0,1.08,1.0,0.8,0.14,61.15,750.07,ArcanineNeutral32,29.0,130.0,1,True,,,,0.0,0.0,1.0,1.0


In [5]:
# Load previous fit

x0, unpack_info = pack(make_initial_guess())
hash_value = digest(data, x0)
filename = f"./results/least-squares-fit-{hash_value}.npy"
opt = load(filename)
sol = unpack(opt.x, unpack_info)

opt

    message: `ftol` termination condition is satisfied.
    success: True
     status: 2
          x: [ 2.562e-01  2.545e-01 ...  2.195e-01  9.953e-01]
       cost: 970.1076591930191
       grad: [-1.729e-05  1.028e-04 ...  6.970e-05 -7.303e-04]
 optimality: 1.1403826778405346
       nfev: 50
       njev: 35

In [6]:
# Helpers

def truncated_normal_sample(size, mu, sigma, lower, upper):
    return stats.truncnorm.rvs( (lower - mu) / sigma, (upper - mu) / sigma, loc=mu, scale=sigma, size=size)

def round_noise(size):
    eps = 1e-6
    return truncated_normal_sample(size, mu=0.0, sigma=0.25, lower= -0.5+eps, upper= 0.5-eps)

def harmonic(a,b):
    return 2.0 / ( 1.0 / a + 1.0 / b )

In [7]:
# In the boostrap method,
# We create N synthetic datasets that are likely to represent the population
# Then we redo the fit on those, starting from the best fit in initial dataset.
# Finally, we collect statistics on the fitted variables between runs

# To create the synthetic datasets, we use sampling with replacement (allow duplicate)
# Because some Pokémon are rare, we use stratified sampling

stratas = data.groupby(['Pokemon'], group_keys=False)
target_group_size =  stratas.size().quantile(q=0.75)
    

n_boostraps = 50
all_opt_x = [opt.x]

In [8]:
# WARNING: RUNNING THE OPTIMISATION N TIMES IS VERY LONG.
#
# YOU CAN INTERRUPT THE KERNEL (STOP BUTTON) 
# AND RUN THE NEXT CELLS IF YOU WANT TO SEE CURRENT PROGRESS
#
# THE COLLECTION `all_opt_x` IS DEFINED ABOVE THIS CELL
# SO YOU CAN RESUME ADDING MORE RUNS

while( len(all_opt_x) < n_boostraps ):

    print( "Boostrap run " + str(len(all_opt_x)) )
    
    # To create the synthetic datasets, we use sampling with replacement (allow duplicate)
    
    # We'll introduce a correction toward more equal sample size between Pokémon
    # We can motivate that by the fact stratified sampling should use population proportions 
    # instead of current data proportions
    
    resampled = stratas.apply(lambda x: x.sample( round( harmonic(len(x),target_group_size) ), replace=True, ignore_index=True))

    # Add small noise to RP
    # We do so to simulate some unknown value that would round() to current RP

    referenceRP = resampled["RP"].to_numpy()
    referenceRP += round_noise(len(resampled))
    
    # Compute per sample information about help time, nature, sub-skills etc
    recomputed = make_precomputed_columns(resampled)

    # Put the pieces together
    
    def residual(x):
        return referenceRP - compute_rp(x, resampled, recomputed, unpack_info)
    
    # Redo the fit, starting from the optimal we found on current data
    opt2 = scipy.optimize.least_squares(residual, opt.x, loss="soft_l1", xtol=None, verbose=2) 
    
    # Collect results for stats
    all_opt_x.append(opt2.x)


Boostrap run 1
   Iteration     Total nfev        Cost      Cost reduction    Step norm     Optimality   
       0              1         1.1292e+03                                    4.33e+06    
       1              2         1.1144e+03      1.49e+01       4.98e+00       1.64e+06    
       2              7         1.0967e+03      1.77e+01       4.86e-03       9.51e+04    
       3              8         1.0949e+03      1.82e+00       9.72e-03       1.85e+04    
       4              9         1.0943e+03      6.04e-01       1.94e-02       4.87e+03    
       5             10         1.0937e+03      5.45e-01       3.89e-02       6.12e+01    
       6             11         1.0929e+03      7.84e-01       7.78e-02       9.20e+02    
       7             12         1.0915e+03      1.40e+00       1.56e-01       1.30e+03    
       8             13         1.0892e+03      2.37e+00       3.11e-01       4.35e+03    
       9             14         1.0855e+03      3.66e+00       6.22e-01    

  z = (f / f_scale) ** 2


       1              6         1.0374e+03      4.56e+01       1.32e+00       2.46e+06    
       2              8         1.0361e+03      1.32e+00       6.59e-01       2.86e+04    
       3             10         1.0361e+03      2.55e-02       3.30e-01       1.34e+03    
       4             12         1.0361e+03      4.54e-03       1.65e-01       1.77e+01    
       5             13         1.0361e+03      4.83e-03       3.30e-01       4.22e+02    
       6             16         1.0361e+03      2.12e-04       4.12e-02       1.33e+01    
       7             18         1.0361e+03      7.07e-05       2.06e-02       1.57e+01    
       8             19         1.0361e+03      7.51e-05       4.12e-02       5.66e+01    
       9             22         1.0361e+03      3.29e-06       5.15e-03       2.21e+00    
`ftol` termination condition is satisfied.
Function evaluations 22, initial cost 1.0830e+03, final cost 1.0361e+03, first-order optimality 2.21e+00.
Boostrap run 47
   Iteration    

In [9]:
# Save
boostrap_filename = f"./results/bootstrap-fit-{hash_value}.npy"
save(boostrap_filename, all_opt_x)

# Stats
center = np.mean(all_opt_x, axis=0)
interval = 1.96 * np.std(all_opt_x, axis=0) # 95% confidence region

sol_center = unpack(center, unpack_info)
sol_interval = unpack(interval, unpack_info)

# Pretty display
pd.DataFrame({
    
    "Pokemon":game.pokedex.data["Pokemon"], 
    
    "ing%": sol_center["Pokemons ing fractions"]*100.0,
    "conf (ing%)": sol_interval["Pokemons ing fractions"]*100.0,
    
    "skill% * skillValue": sol_center["Pokemons skill products"], 
    "conf (skill% * skillValue)": sol_interval["Pokemons skill products"], 

}).set_index("Pokemon")

Unnamed: 0_level_0,ing%,conf (ing%),skill% * skillValue,conf (skill% * skillValue)
Pokemon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bulbasaur,25.613301,0.06549,16.736075,0.105389
Ivysaur,25.44917,0.0334,16.697394,0.028053
Venusaur,26.559529,0.047075,18.436675,0.041629
Charmander,20.037733,0.042202,9.676956,0.059279
Charmeleon,22.653238,0.036567,14.052722,0.032424
Charizard,22.381053,0.04241,14.032367,0.070709
Squirtle,26.990739,0.076952,17.631474,0.102146
Wartortle,27.050244,0.034805,17.568334,0.032492
Blastoise,27.430092,0.041192,18.4294,0.0338
Caterpie,17.897961,0.192845,7.009285,0.09686


In [10]:
# Explain what we did with the re-sampling target size

pd.DataFrame({'before': stratas.size(), 'after': stratas.apply(lambda x: round( harmonic(len(x), target_group_size) )) })

Unnamed: 0_level_0,before,after
Pokemon,Unnamed: 1_level_1,Unnamed: 2_level_1
Absol,30,43
Altaria,8,14
Ampharos,12,21
Arbok,19,30
Arcanine,35,48
Bayleef,20,31
Bellsprout,70,72
Blastoise,40,52
Bonsly,44,55
Bulbasaur,155,100
