In [23]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, ParameterGrid
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

In [24]:
# Example of loading data into a pandas DataFrame (replace with your actual data loading)
df = pd.read_csv('data/tmp/encoded_train.csv')
test_df = pd.read_csv('data/tmp/encoded_test.csv')

# Display the first few rows of the dataframe
df.head()


Unnamed: 0,Day,Month,Hour,Night,Holiday,Block,lat,long,Category,DayOfWeek-Monday,...,StreetType-LN,StreetType-DR,StreetType-CT,StreetType-WAY,StreetType-RW,StreetType-PZ,Season-Winter,Season-Fall,Season-Spring,Season-Summer
0,29,11,0,1,0,0,0.584478,-1.557336,OTHER OFFENSES,0,...,0,0,0,0,0,0,0,1,0,0
1,1,6,8,0,0,1,0.911468,0.775401,OTHER OFFENSES,0,...,0,0,0,0,0,0,0,0,0,1
2,27,4,6,0,0,1,-2.045603,0.570183,OTHER OFFENSES,0,...,0,0,0,0,0,0,0,0,1,0
3,1,4,15,0,0,32,1.510611,-0.150875,ASSAULT,0,...,0,0,0,0,0,0,0,0,1,0
4,25,7,16,0,0,0,0.718501,0.532851,ASSAULT,0,...,0,0,0,0,0,0,0,0,0,1


In [25]:
# Define features (X) and target (y)
X = df.drop('Category', axis=1)  # Features: all columns except 'Category'
y = df['Category'] 

x_test = test_df.drop('Category', axis=1) # Features: all columns except 'Category'
y_test = test_df['Category']


In [26]:
# Define the parameter grid
#param_grid = {
#    'n_estimators': [50],
#    'max_depth': [None, 10, 20, 30],
#    'min_samples_split': [2, 5, 10],
#    'min_samples_leaf': [1, 2, 4],
#    'bootstrap': [True, False]
#}

param_grid = {
    'n_estimators': [50],
    'max_depth': [None, 30, 40, 50],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [4, 6, 8],
    'bootstrap': [False]
}

# Create a grid of parameters
param_list = list(ParameterGrid(param_grid))

In [27]:
# Track the best model and its score
best_score = 0
best_params = None
best_model = None

case_nr = 1

# Iterate over each combination of parameters
for params in param_list:
    print("")
    print(f"Case {case_nr} / {len(param_list)}")
    print(f"Testing parameters: {params}")
    model = RandomForestClassifier(random_state=42, n_jobs=8, **params)
    model.fit(X, y)
    
    # Evaluate the model on the test set
    y_pred = model.predict(x_test)
    score = accuracy_score(y_test, y_pred)
    
    # Update the best model if the current one is better
    if score > best_score:
        best_score = score
        best_params = params
        best_model = model
    
    case_nr += 1

print(f"Best parameters found: {best_params}")
print(f"Best accuracy: {best_score}")



Case 1 / 24
Testing parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}

Case 2 / 24
Testing parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 50}

Case 3 / 24
Testing parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 50}

Case 4 / 24
Testing parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 6, 'min_samples_split': 5, 'n_estimators': 50}

Case 5 / 24
Testing parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 8, 'min_samples_split': 2, 'n_estimators': 50}

Case 6 / 24
Testing parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 8, 'min_samples_split': 5, 'n_estimators': 50}

Case 7 / 24
Testing parameters: {'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}

Case 8 / 24
Testing p

In [28]:
# Evaluate the best model
print("Classification report for the best model:")
print(classification_report(y_test, y_pred))


Classification report for the best model:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                             precision    recall  f1-score   support

                      ARSON       0.00      0.00      0.00       301
                    ASSAULT       0.22      0.21      0.22     15310
                   BURGLARY       0.20      0.06      0.09      7254
         DISORDERLY CONDUCT       0.03      0.00      0.01       805
DRIVING UNDER THE INFLUENCE       0.00      0.00      0.00       446
              DRUG/NARCOTIC       0.32      0.41      0.36     10066
                DRUNKENNESS       0.00      0.00      0.00       814
               EMBEZZLEMENT       0.00      0.00      0.00       213
     FORGERY/COUNTERFEITING       0.25      0.03      0.05      1965
                      FRAUD       0.20      0.01      0.01      3363
                 KIDNAPPING       0.00      0.00      0.00       469
              LARCENY/THEFT       0.33      0.73      0.45     36758
                LIQUOR LAWS       0.50      0.01      0.02       373
                  LOITERING      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
