In [192]:
import pandas as pd
from data import MongoDB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder


In [380]:
db = MongoDB("Collection")
df = db.dataframe()
df.head()

Unnamed: 0,Name,Type,Level,Rarity,Damage,Health,Energy,Sanity,Timestamp
0,Pit Fiend,Demonic,5,Rank 4,5d10,47.68,47.82,51.38,2024-02-22 21:16:56
1,Ice Mephit,Elemental,3,Rank 3,3d8,27.25,21.9,25.28,2024-02-22 21:16:56
2,Pit Lord,Devilkin,11,Rank 1,11d4+1,42.69,43.93,43.73,2024-02-22 21:16:56
3,Ghostly Archer,Undead,3,Rank 0,3d2,5.18,5.16,6.46,2024-02-22 21:16:56
4,Gold Faerie,Fey,7,Rank 0,7d2+4,13.08,14.89,14.8,2024-02-22 21:16:56


In [382]:
# Drop high cardinality categorical columns
df.drop(['Name', 'Damage', 'Timestamp'], axis=1, inplace=True)
df.head()

KeyError: "['Name', 'Damage', 'Timestamp'] not found in axis"

In [383]:
# Define target variables and features
y = df['Rarity'].str.extract('(\d+)')
oneD_y = y.squeeze().astype(int)
X = df.drop(columns=['Rarity'])


In [384]:
# One-hot encoding
encoded_data = pd.get_dummies(df['Type'], dtype=int)

# Concatenate the original DataFrame with the encoded columns
data_encoded = pd.concat([df, encoded_data], axis=1)

In [385]:
data_encoded.drop(['Type', 'Rarity'], axis=1, inplace=True)

In [386]:
data_encoded.head()

Unnamed: 0,Level,Health,Energy,Sanity,Demonic,Devilkin,Dragon,Elemental,Fey,Undead
0,5,47.68,47.82,51.38,1,0,0,0,0,0
1,3,27.25,21.9,25.28,0,0,0,1,0,0
2,11,42.69,43.93,43.73,0,1,0,0,0,0
3,3,5.18,5.16,6.46,0,0,0,0,0,1
4,7,13.08,14.89,14.8,0,0,0,0,1,0


In [371]:
print(data_encoded.shape)
print(oneD_y.shape)

(2500, 10)
(2500,)


In [387]:
X_train, X_val, y_train, y_val = train_test_split(data_encoded, oneD_y, test_size=0.2, random_state=42)

In [373]:
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

(2000, 10)
(500, 10)
(2000,)
(500,)


In [374]:
X_train.head()

Unnamed: 0,Level,Health,Energy,Sanity,Demonic,Devilkin,Dragon,Elemental,Fey,Undead
2055,5,27.8,28.48,30.39,1,0,0,0,0,0
1961,12,23.34,23.44,24.85,0,0,1,0,0,0
1864,10,59.04,61.35,57.2,1,0,0,0,0,0
2326,8,50.57,49.48,46.94,0,0,0,0,1,0
461,1,5.71,7.98,7.8,0,0,0,1,0,0


In [375]:
print(X_val.head())
#print(y_val.head())

      Level  Health  Energy  Sanity  Demonic  Devilkin  Dragon  Elemental  \
1447      4   34.10   32.27   28.75        0         0       0          1   
1114     10   20.66   20.12   20.31        0         0       1          0   
1064      7   13.23   14.36   13.44        1         0       0          0   
2287      7   14.38   14.44   14.48        0         0       0          0   
1537      6   48.24   48.88   46.74        0         1       0          0   

      Fey  Undead  
1447    0       0  
1114    0       0  
1064    0       0  
2287    0       1  
1537    0       0  


In [376]:
model_rf = make_pipeline(
    RandomForestClassifier()
)
model_rf.fit(X_train, y_train)

print('Forest: Training Accuracy:', model_rf.score(X_train, y_train))
print('Forest: Validation Accuracy', model_rf.score(X_val, y_val))

Forest: Training Accuracy: 1.0
Forest: Validation Accuracy 0.954


In [377]:
from sklearn.preprocessing import OrdinalEncoder
baseline = RandomForestClassifier()
baseline.fit(X_train, y_train)
model_rf = RandomForestClassifier(random_state=42,
                                  n_jobs=-1,
                                  max_depth=50,
                                  max_samples=0.42,
                                  n_estimators=100)
model_rf.fit(X_train, y_train)

print('Forest: Training Accuracy:', baseline.score(X_train, y_train))
print('Forest: Validation Accuracy', baseline.score(X_val, y_val))

print('Forest: Training Accuracy:', model_rf.score(X_train, y_train))
print('Forest: Validation Accuracy', model_rf.score(X_val, y_val))

Forest: Training Accuracy: 1.0
Forest: Validation Accuracy 0.962
Forest: Training Accuracy: 0.9985
Forest: Validation Accuracy 0.94


In [378]:
from sklearn.ensemble import AdaBoostClassifier

ada_classifier = AdaBoostClassifier(n_estimators=50, learning_rate=2, random_state=42, )
ada_classifier.fit(X_train,y_train)


print('Training Accuracy: Adaboost', ada_classifier.score(X_train, y_train))
print('Validation Accuracy: Adaboost', ada_classifier.score(X_val, y_val))

Training Accuracy: Adaboost 0.443
Validation Accuracy: Adaboost 0.428


In [389]:
from xgboost import XGBClassifier

xg_classifier = XGBClassifier(n_estimators=10, max_depth=5, random_state=42, eval_metric='merror')

xg_classifier.fit(X_train,y_train)

print('Training Accuracy: xgboost', xg_classifier.score(X_train, y_train))
print('Validation Accuracy: xgboost', xg_classifier.score(X_val, y_val))

Training Accuracy: xgboost 0.9805
Validation Accuracy: xgboost 0.928


In [None]:
"""
3 seperate models were trained on the data pulled from the mongoDB Collection,
AdaBoostClassifier, Random Forest Classifier, and XGBClassifier

I tuned the hyperparameters in all 3 models manually by testing each model 10 times
then recorded the accuracy and compared them to each other. 
The adaboost was the lowest scoring of the three at 42% the XGBClassifier at 92%
and the best model was the baseline Random Forest model with a 97% the data was split
with a train 80% and validation set 20%. 
All scores were from the validation set of the split data.
"""