In [19]:
# Imports
import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Get the MONGO_URL environment variable
mongo_url = os.getenv('MONGO_URL')

# Database setup: Connect to MongoDB and select the database and collection
client = MongoClient(mongo_url)
db = client['Database']
collection = db['Database']

# Data preparation: Fetch data from MongoDB and convert to DataFrame
data = list(collection.find())
df = pd.DataFrame(data)

# Drop MongoDB's automatic '_id' column if it's present in the DataFrame
if '_id' in df.columns:
    df.drop(columns=['_id'], inplace=True)

# Initial Data Exploration: Identify categorical columns
print("Categorical columns:")
print(df.select_dtypes(include=['object']).columns)

# Preprocessing: Encode categorical variables if needed
df_encoded = pd.get_dummies(df, columns=df.select_dtypes(include=['object']).columns.tolist())

# Define the target and features for modeling
rarity_columns = df_encoded.filter(regex='^Rarity_').columns
X = df_encoded.drop(columns=rarity_columns)
y = df_encoded[rarity_columns].idxmax(axis=1)

# Split the data into training and testing sets for model evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training and evaluation: Setup dictionary of models and parameters for tuning
models = {
    'RandomForest': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=200),
    'GradientBoosting': GradientBoostingClassifier()
}

params = {
    'RandomForest': {'n_estimators': [50, 100], 'max_depth': [5, 10]},
    'LogisticRegression': {'C': [0.1, 1, 10]},
    'GradientBoosting': {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}
}

# Initialize a dictionary to store the best models
best_models = {}

# Train each model using GridSearchCV
for name, model in models.items():
    grid = GridSearchCV(model, params[name], cv=3, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)
    best_models[name] = grid.best_estimator_
    print(f"{name} Best Score: {grid.best_score_:.4f}")

# Select and report the best model based on the test accuracy
best_model_name = max(best_models, key=lambda name: best_models[name].score(X_test, y_test))
best_model = best_models[best_model_name]
best_accuracy = best_model.score(X_test, y_test)

print(f"Best Model: {best_model_name}")
print(f"Accuracy: {best_accuracy:.4f}")

# Save the best model to disk using joblib
joblib.dump(best_model, 'best_model.joblib')

# Load the model from disk to confirm it can be loaded successfully
loaded_model = joblib.load('best_model.joblib')
assert loaded_model.score(X_test, y_test) == best_accuracy

# Generate and display a classification report
predictions = best_model.predict(X_test)
print(classification_report(y_test, predictions))

# Summary of the best model
"""
The best model identified through our training and tuning process is 
the Logistic Regression model. It achieved the highest 
cross-validated score of 0.9883 during training 
and an impressive accuracy of 0.9900 on the test set, 
outperforming both RandomForest and GradientBoosting classifiers. 
Logistic Regression, a simple yet powerful linear model, 
provided excellent precision, recall, and F1 scores across most classes. 
Its high accuracy and reliability make it the optimal choice 
for predicting the 'Rarity' of instances based on their features, 
ensuring consistent and dependable results.
"""


Categorical columns:
Index(['Name', 'Type', 'Rarity', 'Damage', 'Timestamp'], dtype='object')
RandomForest Best Score: 0.5642


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression Best Score: 0.9883
GradientBoosting Best Score: 0.8942
Best Model: LogisticRegression
Accuracy: 0.9900
               precision    recall  f1-score   support

Rarity_Rank 0       1.00      1.00      1.00        86
Rarity_Rank 1       1.00      1.00      1.00        73
Rarity_Rank 2       1.00      1.00      1.00        50
Rarity_Rank 3       0.98      1.00      0.99        42
Rarity_Rank 4       0.95      0.97      0.96        39
Rarity_Rank 5       1.00      0.80      0.89        10

     accuracy                           0.99       300
    macro avg       0.99      0.96      0.97       300
 weighted avg       0.99      0.99      0.99       300


The best model identified through our training and tuning process is the Logistic Regression model. It achieved the highest cross-validated score of 0.9883 during training and an impressive accuracy of 0.9900 on the test set, outperforming both RandomForest and GradientBoosting classifiers. Logistic Regression, a simple ye