In [1]:
import numpy as np
import pandas as pd
import scipy
import scipy.stats as stats
import seaborn as sns

# Apply seaborn's default theme to the plots produced in this notebook
sns.set()

In [2]:
# Project files
# The files directory is configured before the remaining rp_model imports below.
# NOTE(review): presumably rp_model.calc / rp_model.utils read that directory at import time,
# which would explain the ordering — confirm before regrouping these imports.
from rp_model.files import set_files_directory

set_files_directory("./files")

from rp_model.calc import (
    game, FitOptions, compute_rp, make_precomputed_columns, make_initial_guess
)
from rp_model.utils import (
    pack, unpack, save, load, digest, isfile, DataStore
)

In [3]:
# Display settings

from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# Progress bar
from tqdm.auto import tqdm

# Raise the pandas display limits so wide / long frames are shown
# (up to 100 columns and 120 rows)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 120)

In [4]:
# Load the previously prepared dataset from the pickle file named by FitOptions.data_file
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.

data = pd.read_pickle(FitOptions.data_file)
# Both expressions are displayed because ast_node_interactivity is set to "all"
data.describe()
data.head()

Unnamed: 0,Level,RP,MS lvl,ModelRP,Difference,Freq1,FreqL,Inv,Berry1,BerryL,Ing1P,Helps per hour,Helps Neutral,NrgMult,IngrMult,SkillMult,Ingr%,SklContr,BerryD,IngD,Dupes,Amnt,Ing2P,Help skill bonus,RP Multiplier
count,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0,7771.0
mean,11.713422,663.504182,1.581135,663.510616,0.006434,3864.869386,3702.560196,12.517694,28.021876,38.874276,108.521554,1.052698,0.987675,0.994451,0.997529,1.0096,0.207487,36.214361,56.775576,148.531141,1.0,0.108352,4.518852,0.985299,1.03156
std,7.755388,442.393993,1.059953,442.407173,0.577314,976.06816,1042.780251,3.542903,2.59646,8.685571,16.133481,0.3206,0.058436,0.044974,0.118567,0.11866,0.045806,40.553191,28.692011,62.733171,0.0,0.612689,24.710514,0.04089,0.073808
min,1.0,245.0,1.0,245.0,-15.0,2200.0,1489.1184,7.0,24.0,24.0,90.0,0.52,0.89,0.92,0.8,0.8,0.13,5.94,25.0,90.0,1.0,0.0,0.0,0.79,1.0
25%,6.0,364.0,1.0,365.0,0.0,3000.0,2836.2,10.0,26.0,33.0,98.0,0.82,1.0,1.0,1.0,1.0,0.18,14.12,36.0,101.0,1.0,0.0,0.0,1.0,1.0
50%,10.0,509.0,1.0,509.0,0.0,3800.0,3678.4,12.0,28.0,37.0,103.0,0.97,1.0,1.0,1.0,1.0,0.2,20.75,47.0,121.0,1.0,0.0,0.0,1.0,1.0
75%,15.0,784.0,2.0,784.0,0.0,4500.0,4382.78,15.0,30.0,43.0,115.0,1.26,1.0,1.0,1.0,1.0,0.23,48.08,68.0,202.0,1.0,0.0,0.0,1.0,1.0
max,45.0,3493.0,6.0,3495.0,5.0,6300.0,6916.14,24.0,35.0,83.0,151.0,2.41,1.12,1.08,1.2,1.2,0.42,414.12,243.0,511.0,1.0,8.0,342.0,1.0,1.44


Unnamed: 0,Pokemon,Level,RP,Nature,MS lvl,Source,ModelRP,Difference,Nature2,ID,NatureP,NatureN,Freq1,FreqL,Type,Class,MSkill,Inv,Berry1,BerryL,Ing1,Ing1P,Helps per hour,Helps Neutral,NrgMult,IngrMult,SkillMult,Ingr%,SklContr,RPneutral,IDNeutral,BerryD,IngD,Dupes,DupeMatch,Sub Skill 1,Sub Skill 2,Ingredient 2,Amnt,Ing2P,Help skill bonus,RP Multiplier
0,Arbok,9.0,574.0,Naughty,1.0,Rate My Mon,574.0,0.0,Naughty,ArbokNaughty91,Speed of Help,Main Skill Chance,3700.0,3276.72,Poison,Berries,Charge Energy S,14.0,32.0,40,Bean Sausage,103.0,1.09,0.9,1.0,1.0,0.8,0.26,22.75,538.35,ArbokNeutral91,80.0,103.0,1,True,,,,0.0,0.0,1.0,1.0
1,Arbok,8.0,531.0,Hardy,1.0,Questions help guides,531.0,0.0,Neutral,ArbokNeutral81,-,-,3700.0,3648.2,Poison,Berries,Charge Energy S,14.0,32.0,39,Bean Sausage,103.0,0.98,1.0,1.0,1.0,1.0,0.26,22.75,531.0,ArbokNeutral81,78.0,103.0,1,True,,,,0.0,0.0,1.0,1.0
2,Arbok,8.0,538.0,Quiet,1.0,Questions help guides,538.0,0.0,Quiet,ArbokQuiet81,Ingredient Finding,Exp Gains,3700.0,3648.2,Poison,Berries,Charge Energy S,14.0,32.0,39,Bean Sausage,103.0,0.98,1.0,1.0,1.2,1.0,0.26,22.75,530.61,ArbokNeutral81,78.0,103.0,1,True,,,,0.0,0.0,1.0,1.0
3,Arbok,9.0,582.0,Relaxed,1.0,,582.0,0.0,Relaxed,ArbokRelaxed91,Energy Recovery,EXP Gains,3700.0,3640.8,Poison,Berries,Charge Energy S,14.0,32.0,40,Bean Sausage,103.0,0.98,1.0,1.08,1.0,1.0,0.26,22.75,538.89,ArbokNeutral91,80.0,103.0,1,True,,,,0.0,0.0,1.0,1.0
4,Arbok,9.0,489.0,Timid,1.0,,489.0,0.0,Timid,ArbokTimid91,EXP Gains,Speed of Help,3700.0,4004.88,Poison,Berries,Charge Energy S,14.0,32.0,40,Bean Sausage,103.0,0.89,1.1,1.0,1.0,1.0,0.26,22.75,538.45,ArbokNeutral91,80.0,103.0,1,True,,,,0.0,0.0,1.0,1.0


In [5]:
# Load previous fit

# pack() returns the packed initial guess (x0) together with the unpack_info
# that unpack() needs later on.
x0, unpack_info = pack(*make_initial_guess())

# NOTE(review): hash_value is not used in any of the visible cells
hash_value = digest(data, x0)

# Read the stored fit result, declaring that it depends on the current data and initial guess.
# NOTE(review): the outcome of the validation is not checked here (no store.is_valid() test,
# unlike the bootstrap cell) — confirm what store.data() returns when the file is missing or stale.
store = (DataStore()
         .with_dependency_on(data, x0)
         .try_read_and_validate(FitOptions.result_file)
         )

opt = store.data()
# Unpacked view of the fitted parameter vector
sol = unpack(opt.x, unpack_info)

# Display the optimisation result
opt

    message: `ftol` termination condition is satisfied.
    success: True
     status: 2
          x: [ 5.625e-01  5.436e-01 ...  3.747e-01  3.691e-01]
       cost: 1283.8894778888819
       grad: [ 1.364e-05  3.493e-06 ...  0.000e+00  0.000e+00]
 optimality: 0.0070385140425059944
       nfev: 21
       njev: 10

In [6]:
# Helpers

def truncated_normal_sample(size, mu, sigma, lower, upper, random_state=None):
    """Draw samples from a normal distribution truncated to [lower, upper].

    Parameters
    ----------
    size : int or tuple of int
        Number (or shape) of samples to draw.
    mu, sigma : float
        Mean and standard deviation of the underlying (untruncated) normal.
    lower, upper : float
        Truncation bounds, expressed in the same units as ``mu``.
    random_state : None, int, numpy.random.Generator or numpy.random.RandomState, optional
        Forwarded to scipy. ``None`` (the default) keeps the previous behaviour
        of drawing from numpy's global random state; pass a seed or a generator
        to make the draw reproducible.

    Returns
    -------
    numpy.ndarray
        The drawn samples.
    """
    # scipy.stats.truncnorm expects the bounds in units of sigma relative to mu
    a = (lower - mu) / sigma
    b = (upper - mu) / sigma
    return stats.truncnorm.rvs(a, b, loc=mu, scale=sigma, size=size, random_state=random_state)


def round_noise(size, random_state=None):
    """Draw noise strictly inside (-0.5, 0.5), centred on 0 with sigma 0.25.

    Adding this noise to an integer-valued quantity yields a value that still
    rounds back to the same integer.

    Parameters
    ----------
    size : int or tuple of int
        Number (or shape) of noise values to draw.
    random_state : None, int, numpy.random.Generator or numpy.random.RandomState, optional
        Forwarded to :func:`truncated_normal_sample`; ``None`` keeps the
        previous (unseeded) behaviour.

    Returns
    -------
    numpy.ndarray
        The drawn noise values.
    """
    # Keep a small margin from +/-0.5 so a noisy value never lands exactly on a rounding tie
    eps = 1e-6
    return truncated_normal_sample(
        size, mu=0.0, sigma=0.25, lower=-0.5 + eps, upper=0.5 - eps, random_state=random_state
    )


def harmonic(a, b):
    """Return the harmonic mean of ``a`` and ``b``: 2 / (1/a + 1/b)."""
    reciprocal_sum = 1.0 / a + 1.0 / b
    return 2.0 / reciprocal_sum

In [7]:
# In the bootstrap method,
# we create N synthetic datasets that are likely to represent the population.
# Then we redo the fit on those, starting from the best fit on the initial dataset.
# Finally, we collect statistics on the fitted variables across runs.

# To create the synthetic datasets, we use sampling with replacement (duplicates allowed).
# Because some Pokémon are rare, we use stratified sampling.

# One stratum per Pokémon
stratas = data.groupby(['Pokemon'], group_keys=False)
# Reference group size: the 75th percentile of the per-Pokémon row counts.
# The bootstrap cell combines it with each group's own size through harmonic().
target_group_size = stratas.size().quantile(q=0.75)

# Number of bootstrap fits to run
n_bootstraps = 50
# Collected parameter vectors; the first entry is the fit on the original data.
# NOTE: re-running this cell resets the collection and discards bootstrap runs accumulated so far.
all_opt_x = [opt.x]

In [8]:
# Set to False to force the bootstrap fits to be recomputed even if a valid cache exists
cache_fit = True

store.try_read_and_validate(FitOptions.boostrap_file)

if cache_fit and store.is_valid():

    all_opt_x = store.data()
    print("Loaded from cache")

else:

    # WARNING: RUNNING THE OPTIMISATION N TIMES IS VERY LONG.
    #
    # YOU CAN INTERRUPT THE KERNEL (STOP BUTTON)
    # AND RUN THE NEXT CELLS IF YOU WANT TO SEE CURRENT PROGRESS
    #
    # THE COLLECTION `all_opt_x` IS DEFINED ABOVE THIS CELL
    # SO YOU CAN RESUME ADDING MORE RUNS

    # all_opt_x[0] is the fit on the original data, so len(all_opt_x) - 1 bootstrap runs
    # are already done. Starting the bar there keeps the display correct when resuming.
    with tqdm(total=n_bootstraps, initial=len(all_opt_x) - 1) as pbar:
        while len(all_opt_x) <= n_bootstraps:
            print("Boostrap run " + str(len(all_opt_x)))

            # To create the synthetic datasets, we use sampling with replacement (duplicates allowed)

            # We introduce a correction toward more equal sample sizes between Pokémon.
            # We can motivate that by the fact stratified sampling should use population proportions
            # instead of current data proportions.
            # Each group is resampled to the harmonic mean of its own size and target_group_size.

            resampled = stratas.apply(
                lambda x: x.sample(round(harmonic(len(x), target_group_size)), replace=True, ignore_index=True))

            # Add small noise to RP
            # We do so to simulate some unknown value that would round() to current RP
            #
            # Build a NEW float array instead of using `+=` on the result of to_numpy():
            # that array can be a view on the frame's own data, so an in-place add would
            # either write the noise back into resampled["RP"] (which is passed to
            # make_precomputed_columns / compute_rp below) or fail on a read-only array
            # when pandas copy-on-write is enabled.

            referenceRP = resampled["RP"].to_numpy(dtype=float) + round_noise(len(resampled))

            # Compute per sample information about help time, nature, subskills etc
            recomputed = make_precomputed_columns(resampled)


            # Put the pieces together

            def residual(x):
                """Residual between the noisy reference RP and the model RP for parameters x."""
                return referenceRP - compute_rp(x, resampled, recomputed, unpack_info)


            #FitOptions.soft_round.exact = False
            #FitOptions.soft_round.alpha = 6

            # Redo the fit, starting from the optimal we found on current data
            opt2 = scipy.optimize.least_squares(residual, opt.x, **FitOptions.least_squares_kwargs)

            # Collect results for stats
            all_opt_x.append(opt2.x)

            # Advance the bar only once the run has actually completed, so an
            # interrupted fit is not counted as done
            pbar.update()

    # Save
    store.use_data(all_opt_x).save_to(FitOptions.boostrap_file)

  0%|          | 0/50 [00:00<?, ?it/s]

Boostrap run 1
   Iteration     Total nfev        Cost      Cost reduction    Step norm     Optimality   
       0              1         1.6517e+03                                    6.86e+04    
       1              5         1.5585e+03      9.32e+01       1.28e-01       4.07e+02    
       2              8         1.5585e+03      6.05e-02       1.60e-02       3.11e+00    
       3              9         1.5584e+03      6.53e-02       3.19e-02       1.04e+01    
       4             12         1.5584e+03      3.41e-03       3.99e-03       7.18e-01    
       5             13         1.5584e+03      3.64e-03       7.98e-03       8.08e-01    
       6             16         1.5584e+03      1.58e-04       9.97e-04       1.25e-01    
       7             17         1.5584e+03      1.18e-04       1.99e-03       7.86e-02    
       8             23         1.5584e+03      2.99e-09       3.90e-06       1.72e-03    
`ftol` termination condition is satisfied.
Function evaluations 23, initial

In [9]:
# Stats
# The per-parameter median across runs is used as a robust estimate of the centre.
# The spread is derived from the median absolute deviation (MAD) from that median:
# 1.4826 * MAD estimates the std, and 1.96 * 1.4826 ≈ 2.9 is rounded to the factor 3 used below.

center = np.median(all_opt_x, axis=0)

# Half-widths of the 95% confidence region, std-based and MAD-based
interval_std = np.std(all_opt_x, axis=0) * 1.96
interval_mad = np.median(np.abs(np.asarray(all_opt_x) - center), axis=0) * 3

# Unpack the three flat vectors back into the named solution structure
sol_center, sol_interval_std, sol_interval_mad = (
    unpack(vector, unpack_info) for vector in (center, interval_std, interval_mad)
)

# Pretty display: one row per Pokémon, MAD-based intervals marked with "*"
# (the std-based equivalents are available in sol_interval_std)
pd.DataFrame(
    {
        "Pokemon": game.data.pokedex["Pokemon"],
        "ing%": sol_center["Pokemons ing fractions"] * 100.0,
        "conf (ing)*": sol_interval_mad["Pokemons ing fractions"] * 100.0,
        "skill% * skillValue": sol_center["Pokemons skill products"],
        "conf (skill)*": sol_interval_mad["Pokemons skill products"],
    }
).set_index("Pokemon")

Unnamed: 0_level_0,ing%,conf (ing)*,skill% * skillValue,conf (skill)*
Pokemon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bulbasaur,25.631218,20.04096,16.706737,37.55977
Ivysaur,25.438258,20.023825,16.709849,37.524044
Venusaur,26.552188,20.034463,18.46975,37.527648
Charmander,20.033806,20.035988,9.682476,37.560938
Charmeleon,22.672448,20.027998,14.047551,37.535121
Charizard,22.351711,20.040109,14.097506,37.550471
Squirtle,27.00085,20.055232,17.627339,37.58228
Wartortle,27.043241,20.032591,17.585425,37.526159
Blastoise,27.438671,20.03252,18.456103,37.521833
Caterpie,17.932679,20.113226,6.982601,37.543932


In [10]:
# Explain what we did with the re-sampling target size
# pd.DataFrame({'before': stratas.size(), 'after': stratas.apply(lambda x: round( harmonic(len(x), target_group_size) )) })