In [3]:
from ucimlrepo import fetch_ucirepo 
  
# Download dataset id 158 from the UCI ML repository.
poker_hand = fetch_ucirepo(id=158)

# Unpack the feature table and the target table (pandas dataframes).
X, y = poker_hand.data.features, poker_hand.data.targets



In [4]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Step 3: Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable.
# NOTE(review): no `stratify=` argument is passed, so rare classes may be spread
# unevenly between the two splits — consider `stratify=y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [4]:
# Step 4: Define the model pipeline
# Names of the columns that will be one-hot encoded: S1..S5 are listed as the
# suit columns and C1..C5 as the rank columns.
suits_columns = [f'S{i}' for i in range(1, 6)]
ranks_columns = [f'C{i}' for i in range(1, 6)]


In [5]:
# Single list of every column to one-hot encode: suit columns first, then rank columns.
categorical_columns = [*suits_columns, *ranks_columns]

In [6]:
# Preprocessing step: one-hot encode every column named in `categorical_columns`.
# `remainder='passthrough'` forwards any feature column that is NOT listed
# unchanged. It has nothing to do with the target, which is kept separately in
# `y` rather than as a column of `X`.
preprocessor = ColumnTransformer(
    [('categorical', OneHotEncoder(), categorical_columns)],
    remainder='passthrough',
)

In [7]:
# Baseline pipeline: the one-hot preprocessor feeding a 100-tree random forest
# with a fixed seed.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'classifier',
            RandomForestClassifier(
                n_estimators=100,
                random_state=42,
            ),
        ),
    ]
)

In [9]:
# Step 5: Fit the pipeline on the training split.
# The target table is converted to a NumPy array and flattened to 1-D before
# being passed as the labels.
model.fit(
    X_train,
    y_train.to_numpy().ravel(),
)

In [10]:
# Step 6: Predict a label for every row of the held-out test features.
y_pred = model.predict(X=X_test)

In [11]:
# Step 7: Score the predictions against the true test labels and report the
# accuracy to four decimal places.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9183


In [12]:
# Per-class precision / recall / F1 / support for the test predictions.
# Overall accuracy hides how the rare classes are handled, so the per-class
# breakdown is the more informative view here.
# `zero_division=0` reports 0.0 for metrics that are undefined (e.g. precision
# of a class with no predicted samples) without raising a warning per metric;
# the default "warn" setting reports the same 0.0 but also warns.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    153717
           1       0.86      0.98      0.92    130369
           2       0.83      0.01      0.02     14545
           3       0.88      0.19      0.31      6524
           4       1.00      0.00      0.01      1203
           5       1.00      0.01      0.01       611
           6       1.00      0.01      0.02       446
           7       0.00      0.00      0.00        81
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         3

    accuracy                           0.92    307503
   macro avg       0.65      0.22      0.23    307503
weighted avg       0.92      0.92      0.89    307503



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
# Rebuild the pipeline, this time with class_weight='balanced' so that minority
# classes are assigned higher weights during training.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'classifier',
            RandomForestClassifier(
                n_estimators=100,
                class_weight='balanced',
                random_state=42,
            ),
        ),
    ]
)


In [14]:
# Step 5 (repeated): fit the current `model` pipeline on the training split,
# passing the target as a flattened 1-D NumPy array.
model.fit(
    X_train,
    y_train.to_numpy().ravel(),
)

In [15]:
# Step 6 (repeated): predict test-set labels with the current `model`;
# this overwrites the previous `y_pred`.
y_pred = model.predict(X=X_test)

In [16]:
# Step 7 (repeated): accuracy of the current predictions on the test labels,
# printed to four decimal places.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8205


In [17]:
# Per-class precision / recall / F1 / support for the current `y_pred`.
# `zero_division=0` reports 0.0 for undefined metrics (e.g. precision of a
# class with no predicted samples) without the per-metric warnings that the
# default "warn" setting raises; the reported values are the same.
# The function is already imported, so it is not re-imported in this cell.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.83      0.98      0.90    153717
           1       0.81      0.77      0.79    130369
           2       0.81      0.01      0.01     14545
           3       0.78      0.03      0.06      6524
           4       1.00      0.00      0.00      1203
           5       0.99      0.99      0.99       611
           6       1.00      0.00      0.00       446
           7       0.00      0.00      0.00        81
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         3

    accuracy                           0.82    307503
   macro avg       0.62      0.28      0.28    307503
weighted avg       0.82      0.82      0.79    307503



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# NOTE(review): this unexecuted cell repeats the class_weight='balanced'
# pipeline definition from earlier in the notebook. Running it rebinds `model`
# to a new, unfitted pipeline — refit before predicting, or drop the cell if it
# is a leftover.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'classifier',
            RandomForestClassifier(
                n_estimators=100,
                class_weight='balanced',
                random_state=42,
            ),
        ),
    ]
)