In [7]:
!pip install lazypredict scikit-learn

Collecting lazypredict
  Downloading lazypredict-0.2.16-py2.py3-none-any.whl.metadata (13 kB)
Collecting pytest-runner (from lazypredict)
  Downloading pytest_runner-6.0.1-py3-none-any.whl.metadata (7.3 kB)
Collecting mlflow>=2.0.0 (from lazypredict)
  Downloading mlflow-3.3.2-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.3.2 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_skinny-3.3.2-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.3.2 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_tracing-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting docker<8,>=4.0.0 (from mlflow>=2.0.0->lazypredict)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow>=2.0.0->lazypredict)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow>=2.0.0->lazypredict)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflo

In [8]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

# --- Load the raw match data -------------------------------------------------
test_data = pd.read_csv("/content/test_data.csv")
train_data = pd.read_csv("/content/train_data.csv")

# --- Column roles ------------------------------------------------------------
target = 'Result'
categorical_features = ['Season', 'Opponent', 'Home or Away']
numerical_features = ['Opponent(Rating)']
# Single ordered list of model inputs, shared by the train and test selections.
feature_columns = categorical_features + numerical_features

# --- Split each frame into a feature matrix and a raw label column -----------
X_train = train_data[feature_columns]
X_test = test_data[feature_columns]
y_train_labels = train_data[target]
y_test_labels = test_data[target]

# --- Encode the target -------------------------------------------------------
# LabelEncoder numbers the classes in sorted order of the labels seen in the
# training data (so, assuming the labels are 'D'/'L'/'W', D -> 0, L -> 1,
# W -> 2 -- confirm with le.classes_). The test labels reuse the mapping
# learned from the training labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train_labels)
y_test = le.transform(y_test_labels)

# --- Encode the features -----------------------------------------------------
# Each categorical column is mapped to integer codes. A category that was not
# present when the encoder was fitted (e.g. an opponent that only appears in
# the test data) is encoded as -1 instead of raising an error.
category_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# The numerical column is passed through unchanged; any column not listed in
# either transformer is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', category_encoder, categorical_features),
        ('num', 'passthrough', numerical_features),
    ],
    remainder='drop',
)

# Learn the encodings from the training features only, then apply the same
# fitted transformer to both splits.
preprocessor.fit(X_train)
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# --- Report ------------------------------------------------------------------
print(
    "Data preprocessing complete.",
    f"Original training features shape: {X_train.shape}",
    f"Processed training features shape: {X_train_processed.shape}",
    sep="\n",
)

Data preprocessing complete.
Original training features shape: (100, 4)
Processed training features shape: (100, 4)


In [9]:
from lazypredict.Supervised import LazyClassifier

# Initialize LazyClassifier.
# ignore_warnings=True keeps the output clean, random_state fixes the seed so
# repeated runs are reproducible, and predictions=True makes fit() return a
# second object alongside the leaderboard (unpacked below as `predictions`).
clf = LazyClassifier(verbose=0,
                     ignore_warnings=True,
                     custom_metric=None,
                     predictions=True,
                     random_state=123)

# Fit all models.
# One call trains every candidate on the training split and scores it on the
# test split.
models, predictions = clf.fit(X_train_processed, X_test_processed, y_train, y_test)

# Show the model leaderboard.
# The header is printed; the DataFrame itself is left as the cell's final
# expression so the notebook renders it as a rich table instead of the
# line-wrapped plain text that print() produces for wide frames.
print("\n--- Model Performance on 2024-25 Test Data ---")
models

  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 52
[LightGBM] [Info] Number of data points in the train set: 100, number of used features: 4
[LightGBM] [Info] Start training from score -1.966113
[LightGBM] [Info] Start training from score -1.514128
[LightGBM] [Info] Start training from score -0.446287

--- Model Performance on 2024-25 Test Data ---
                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
ExtraTreeClassifier                0.59               0.49    None      0.61   
BaggingClassifier                  0.63               0.45    None      0.63   
RandomForestClassifier             0.63               0.45    None      0.63   
KNeighborsClassifier               0.53   