In [74]:
import pandas as pd
import altair
from vega_datasets import data

car = data.cars()
df = pd.DataFrame(car)
df.head(5)

Unnamed: 0,Name,Miles_per_Gallon,Cylinders,Displacement,Horsepower,Weight_in_lbs,Acceleration,Year,Origin
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,1970-01-01,USA
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,1970-01-01,USA
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,1970-01-01,USA
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,1970-01-01,USA
4,ford torino,17.0,8,302.0,140.0,3449,10.5,1970-01-01,USA


In [79]:
altair.Chart(df).mark_point().encode(
    x='Miles_per_Gallon',
    y='Horsepower',
    color='Origin'
)


In [7]:
from MonsterLab import Monster

In [8]:
def mons_df(num):
    doc_list = []
    for _ in range(num):
        mon = Monster()
        dict = {"Name": mon.name,
                "Type": mon.type,
                "Level": mon.level,
                "Rarity": mon.rarity,
                "Damage": mon.damage,
                "Health": mon.health,
                "Energy": mon.energy,
                "Sanity": mon.sanity,
                "Timestamp": mon.timestamp}
        doc_list.append(dict)
    return pd.DataFrame(doc_list)

In [6]:
mf = mons_df()
mf.head(5)

Unnamed: 0,Name,Type,Level,Rarity,Damage,Health,Energy,Sanity,Timestamp
0,Imp,Demonic,6,Rank 3,6d8,46.31,48.11,48.49,2024-10-03 19:46:52
1,Dust Mephit,Elemental,11,Rank 3,11d8+3,86.61,88.74,91.67,2024-10-03 19:46:52
2,Ghoul,Undead,9,Rank 0,9d2+1,17.44,17.69,18.94,2024-10-03 19:46:52
3,Lightning Spirit,Fey,9,Rank 2,9d6+3,54.74,55.65,54.52,2024-10-03 19:46:52
4,Ice Mephit,Elemental,3,Rank 1,3d4,11.73,12.57,11.01,2024-10-03 19:46:52


In [17]:
chart = altair.Chart(mf, title='title').mark_point().encode(
    x='Health',
    y='Energy',
    color='Rarity',
    tooltip=['Name', 'Type', 'Level', 'Rarity', 'Damage', 'Health', 'Energy', 'Sanity', 'Timestamp']
)
chart

In [18]:
chart.to_json

<bound method TopLevelMixin.to_json of alt.Chart(...)>

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

In [21]:
cols = ["Level", "Health", "Energy", "Sanity", "Rarity"]

df = mons_df(1000)[cols]
val = mons_df(200)[cols]
test = mons_df(200)[cols]

target = 'Rarity'
X = df.drop(columns=target)
y = [int(x.split()[1]) for x in df[target]]

df.head()

Unnamed: 0,Level,Health,Energy,Sanity,Rarity
0,6,11.77,12.34,11.54,Rank 0
1,4,8.18,7.35,7.03,Rank 0
2,20,39.38,39.38,39.88,Rank 0
3,12,72.67,73.99,72.29,Rank 2
4,11,63.02,67.39,64.52,Rank 2


In [22]:
X_val = val.drop(columns=target)
y_val = [int(x.split()[1]) for x in val[target]]

X_test = test.drop(columns= target)
y_test = [int(x.split()[1]) for x in test[target]]

In [25]:
baseline = DummyClassifier()
baseline.fit(X,y)
baseline.score(X_val, y_val)

0.285

In [26]:
logr = LogisticRegression(multi_class='ovr').fit(X, y)
logr.score(X_val, y_val)

0.75

In [68]:
clf1 = RandomForestClassifier(random_state=42)
clf1.fit(X, y)
clf1.score(X_val, y_val)

0.935

In [40]:
clf1.score(X_test, y_test)

0.985

In [60]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

In [87]:
param_grid = {'n_estimators': [75,100,125],
              'max_depth': [None, 100, 80],}

In [88]:
hrs = HalvingRandomSearchCV(
    clf1,
    param_distributions=param_grid
)


In [89]:
hrs.fit(X, y)



In [90]:
hrs.best_score_

0.924074074074074

In [92]:
bestest = hrs.best_estimator_
bestest.score(X_val, y_val)

0.935

In [93]:
bestest.score(X_test, y_test)

0.975

In [94]:
hrs.best_params_

{'n_estimators': 125, 'max_depth': 80}

    For a baseline I used Scikit-Learn's dummy module.
Interestingly, I got a slightly better score out of the dummy
classifier than the 1/6 I expected. That probably means the
classes are not evenly distributed. 
    The next model I tried was a simple logistic regression. It
gave a score of 0.75 which improved on the baseline but ther is room for improvement. So I tried sklearn's Random Forest Classifier. Out of the box it scored 0.935. I wanted to try the experimental Halving Random Search Cross Validation class. It ran fairly quickly but it's best estimator didn't score any better than the Random Forrest. Therefore, for the sake of simplicity, I used the Random Forest Classifier with only the random_state argument specified.

In [57]:
sample = [{"Level": 9,
          "Health": 59.0,
          "Energy": 67.0,
          "Sanity": 37.0
}]
feature_basis = pd.DataFrame(sample)

pred = clf1.predict(feature_basis)
pred

array([2])

In [69]:
proba = clf1.predict_proba(pd.DataFrame(sample))[0].max()
f'{proba:.2%}'

'45.00%'

In [65]:
import time
t = time.time()


1728368784.143855