In [11]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, ParameterGrid
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score
import pandas as pd

In [12]:
# Example of loading data into a pandas DataFrame (replace with your actual data loading)
df = pd.read_csv('../../data/tmp/encoded_train.csv')
test_df = pd.read_csv('../../data/tmp/encoded_test.csv')

# Display the first few rows of the dataframe
df.head()


Unnamed: 0,Day,Month,Hour,Night,Holiday,Block,lat,long,Category,DayOfWeek-Monday,...,StreetType-LN,StreetType-DR,StreetType-CT,StreetType-WAY,StreetType-RW,StreetType-PZ,Season-Winter,Season-Fall,Season-Spring,Season-Summer
0,29,11,0,1,0,0,0.584478,-1.557336,OTHER OFFENSES,0,...,0,0,0,0,0,0,0,1,0,0
1,1,6,8,0,0,1,0.911468,0.775401,OTHER OFFENSES,0,...,0,0,0,0,0,0,0,0,0,1
2,27,4,6,0,0,1,-2.045603,0.570183,OTHER OFFENSES,0,...,0,0,0,0,0,0,0,0,1,0
3,1,4,15,0,0,32,1.510611,-0.150875,ASSAULT,0,...,0,0,0,0,0,0,0,0,1,0
4,25,7,16,0,0,0,0.718501,0.532851,ASSAULT,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# Define features (X) and target (y)
X = df.drop('Category', axis=1)  # Features: all columns except 'Category'
y = df['Category'] 

x_test = test_df.drop('Category', axis=1) # Features: all columns except 'Category'
y_test = test_df['Category']


In [14]:
param_grid = {
    'max_depth': [None, 15, 25, 30, 50],
    'max_leaf_nodes': [None, 1000, 10000, 100000],
}


# Create a grid of parameters
param_list = list(ParameterGrid(param_grid))

In [15]:
# Track the best model and its score
best_score = 0
best_params = None
best_model = None

case_nr = 1

# Iterate over each combination of parameters
for params in param_list:
    print("")
    print(f"Case {case_nr} / {len(param_list)}")
    print(f"Testing parameters: {params}")
    model = DecisionTreeClassifier(random_state=42, **params)
    model.fit(X, y)
    
    # Evaluate the model on the test set
    y_pred = model.predict(x_test)
    score = accuracy_score(y_test, y_pred)
    print(f"Score: {score}")
    
    # Update the best model if the current one is better
    if score > best_score:
        best_score = score
        best_params = params
        best_model = model
    
    case_nr += 1

print(f"Best parameters found: {best_params}")
print(f"Best accuracy: {best_score}")



Case 1 / 20
Testing parameters: {'max_depth': None, 'max_leaf_nodes': None}
Score: 0.23182620579693639

Case 2 / 20
Testing parameters: {'max_depth': None, 'max_leaf_nodes': 1000}
Score: 0.28083252662149083

Case 3 / 20
Testing parameters: {'max_depth': None, 'max_leaf_nodes': 10000}
Score: 0.2823415523033996

Case 4 / 20
Testing parameters: {'max_depth': None, 'max_leaf_nodes': 100000}
Score: 0.25352770343374526

Case 5 / 20
Testing parameters: {'max_depth': 15, 'max_leaf_nodes': None}
Score: 0.2739707305962075

Case 6 / 20
Testing parameters: {'max_depth': 15, 'max_leaf_nodes': 1000}
Score: 0.27959683389328627

Case 7 / 20
Testing parameters: {'max_depth': 15, 'max_leaf_nodes': 10000}
Score: 0.2764193383064746

Case 8 / 20
Testing parameters: {'max_depth': 15, 'max_leaf_nodes': 100000}
Score: 0.2740561471442401

Case 9 / 20
Testing parameters: {'max_depth': 25, 'max_leaf_nodes': None}
Score: 0.23797050281874607

Case 10 / 20
Testing parameters: {'max_depth': 25, 'max_leaf_nodes': 10