In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

In [16]:
# Load the dataset
file_path = './final_data_with_crime_severity.csv'
data = pd.read_csv(file_path)

data.head()

Unnamed: 0.1,Unnamed: 0,index,zipcode,num_felony,num_violation,num_misdemeanor,most_freq_ofns_desc,most_freq_loc_occur,most_freq_prem_type,most_freq_susp_age_group,...,most_freq_vic_age_group,most_freq_vic_race,most_freq_vic_sex,num_total_crimes,median_income,unemployment_rate,num_stopandfrisk,num_police_station,num_restroom,crime_severity
0,0,0,10001.0,2918,734,3677,PETIT LARCENY,INSIDE,STREET,UNKNOWN,...,UNKNOWN,UNKNOWN,D,7329,106509.0,4.3,92,1,8,3
1,1,1,10002.0,2278,1024,4381,PETIT LARCENY,INSIDE,STREET,UNKNOWN,...,UNKNOWN,UNKNOWN,M,7683,43362.0,7.6,109,1,16,3
2,2,2,10003.0,1865,453,3066,PETIT LARCENY,INSIDE,STREET,UNKNOWN,...,UNKNOWN,UNKNOWN,D,5384,152863.0,4.7,82,2,5,2
3,3,3,10004.0,143,70,229,PETIT LARCENY,INSIDE,STREET,UNKNOWN,...,UNKNOWN,UNKNOWN,M,442,232543.0,0.4,7,1,3,1
4,4,4,10005.0,219,76,963,PETIT LARCENY,INSIDE,CHAIN STORE,UNKNOWN,...,UNKNOWN,UNKNOWN,D,1258,189886.0,3.4,7,0,0,1


In [71]:
# Load the dataset
file_path = './final_data_with_crime_severity.csv'
data = pd.read_csv(file_path)

# Drop irrelevant columns (e.g., non-numeric, identifiers, etc.)
irrelevant_cols = ['Unnamed: 0', 'index', 'most_freq_ofns_desc', 
                   'most_freq_loc_occur', 'most_freq_prem_type',
                   'most_freq_susp_age_group', 'most_freq_susp_race', 
                   'most_freq_susp_sex', 'most_freq_vic_age_group', 
                   'most_freq_vic_race', 'most_freq_vic_sex']
data = data.drop(columns=irrelevant_cols, errors='ignore')

# Drop rows with missing values
data = data.dropna()

# Separate features (X) and target variable (y)
X = data.drop(columns=['crime_severity', 'zipcode'])
y = data['crime_severity']

# Ensure the target variable is categorical
y = y.astype('int')  # If not already integer


In [72]:
# Split into train (60%), validation (20%), and test (20%) sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)


In [79]:
# Split into 50% training and 50% temporary
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.5, random_state=42)
# Split the remaining 50% into 25% validation and 25% test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 50% of temporary = 25%


In [81]:
# Initialize the Random Forest model with default parameters
rf_model = RandomForestClassifier(random_state=42)

# Train the model on the training data
rf_model.fit(X_train, y_train)

# Make predictions on the validation set
val_pred = rf_model.predict(X_val)

# Evaluate the model using accuracy and classification report
print("Validation Accuracy:", accuracy_score(y_val, val_pred))
print("\nClassification Report on Validation Set:\n", classification_report(y_val, val_pred))

# Make predictions on the test set
test_pred = rf_model.predict(X_test)

# Evaluate the model on the test set
print("Test Accuracy:", accuracy_score(y_test, test_pred))
print("\nClassification Report on Test Set:\n", classification_report(y_test, test_pred))


Validation Accuracy: 0.9545454545454546

Classification Report on Validation Set:
               precision    recall  f1-score   support

           1       0.94      1.00      0.97        15
           2       0.94      0.94      0.94        17
           3       1.00      0.92      0.96        12

    accuracy                           0.95        44
   macro avg       0.96      0.95      0.96        44
weighted avg       0.96      0.95      0.95        44

Test Accuracy: 0.9333333333333333

Classification Report on Test Set:
               precision    recall  f1-score   support

           1       1.00      0.88      0.94        17
           2       0.87      1.00      0.93        20
           3       1.00      0.88      0.93         8

    accuracy                           0.93        45
   macro avg       0.96      0.92      0.93        45
weighted avg       0.94      0.93      0.93        45



In [None]:
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20],
    'max_features': ['sqrt', 'log2', None],
    # 'class_weight': ['balanced', None],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}


In [83]:
best_params = None
best_score = 0
best_model = None

# Calculate total iterations for progress tracking
total_iterations = (
    len(param_grid['n_estimators']) *
    len(param_grid['max_depth']) *
    len(param_grid['min_samples_leaf']) *
    len(param_grid['max_features']) *
    len(param_grid['class_weight']) *
    len(param_grid['bootstrap']) *
    len(param_grid['criterion'])
)
current_iteration = 0

print(f"Starting hyperparameter tuning: {total_iterations} combinations to evaluate.\n")

for n_estimators in param_grid['n_estimators']:
    for max_depth in param_grid['max_depth']:
        for min_samples_leaf in param_grid['min_samples_leaf']:
            for max_features in param_grid['max_features']:
                for class_weight in param_grid['class_weight']:
                    for bootstrap in param_grid['bootstrap']:
                        for criterion in param_grid['criterion']:
                            # Increment iteration counter
                            current_iteration += 1

                            # Train the model with the current set of hyperparameters
                            rf_model = RandomForestClassifier(
                                n_estimators=n_estimators,
                                max_depth=max_depth,
                                min_samples_leaf=min_samples_leaf,
                                max_features=max_features,
                                class_weight=class_weight,
                                bootstrap=bootstrap,
                                criterion=criterion,
                                random_state=42
                            )
                            rf_model.fit(X_train, y_train)
                            
                            # Evaluate on the validation set
                            val_pred = rf_model.predict(X_val)
                            val_score = accuracy_score(y_val, val_pred)
                            
                            # Print progress and parameters
                            print(f"Iteration {current_iteration}/{total_iterations}")
                            print(f"Params: n_estimators={n_estimators}, max_depth={max_depth}, "
                                  f"min_samples_leaf={min_samples_leaf}, max_features={max_features}, "
                                  f"class_weight={class_weight}, bootstrap={bootstrap}, "
                                  f"criterion={criterion} | Validation Score: {val_score:.4f}\n")
                            
                            # Update best parameters if current score is better
                            if val_score > best_score:
                                best_score = val_score
                                best_params = {
                                    'n_estimators': n_estimators,
                                    'max_depth': max_depth,
                                    'min_samples_leaf': min_samples_leaf,
                                    'max_features': max_features,
                                    'class_weight': class_weight,
                                    'bootstrap': bootstrap,
                                    'criterion': criterion,
                                }
                                best_model = rf_model

# Output the best parameters and score
print("\nHyperparameter tuning complete.")
print(f"Best Parameters: {best_params}")
print(f"Best Validation Score: {best_score:.4f}")


Starting hyperparameter tuning: 1152 combinations to evaluate.

Iteration 1/1152
Params: n_estimators=50, max_depth=None, min_samples_leaf=5, max_features=sqrt, class_weight=balanced, bootstrap=True, criterion=gini | Validation Score: 0.8864

Iteration 2/1152
Params: n_estimators=50, max_depth=None, min_samples_leaf=5, max_features=sqrt, class_weight=balanced, bootstrap=True, criterion=entropy | Validation Score: 0.8864

Iteration 3/1152
Params: n_estimators=50, max_depth=None, min_samples_leaf=5, max_features=sqrt, class_weight=balanced, bootstrap=False, criterion=gini | Validation Score: 0.9318

Iteration 4/1152
Params: n_estimators=50, max_depth=None, min_samples_leaf=5, max_features=sqrt, class_weight=balanced, bootstrap=False, criterion=entropy | Validation Score: 0.9091

Iteration 5/1152
Params: n_estimators=50, max_depth=None, min_samples_leaf=5, max_features=sqrt, class_weight=None, bootstrap=True, criterion=gini | Validation Score: 0.8864

Iteration 6/1152
Params: n_estimators

In [85]:
# Use the best model on the test set
test_pred = best_model.predict(X_test)
test_score = accuracy_score(y_test, test_pred)

print("Best Hyperparameters:", best_params)
print("Validation Accuracy with Best Model:", best_score)
print("Test Accuracy with Best Model:", test_score)


Best Hyperparameters: {'n_estimators': 50, 'max_depth': None, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'class_weight': 'balanced', 'bootstrap': False, 'criterion': 'gini'}
Validation Accuracy with Best Model: 0.9318181818181818
Test Accuracy with Best Model: 0.8444444444444444


In [78]:
# Initialize the model
rf_model = RandomForestClassifier(random_state=42)


In [55]:
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

In [56]:
grid_search.fit(X_train, y_train)


Fitting 3 folds for each of 5760 candidates, totalling 17280 fits
[CV] END bootstrap=True, class_weight=balanced, criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, class_weight=balanced, criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.0s
[CV] END bootstrap=True, class_weight=balanced, criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.0s
[CV] END bootstrap=True, class_weight=balanced, criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, class_weight=balanced, criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   0.0s
[CV] END bootstrap=True, class_weight=balanced, c

5760 fits failed out of a total of 17280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4101 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/John/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/John/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Users/John/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/John/Library/Python/3.9/lib/python/site-packages/sklearn/utils/_param_validation.py", line 95, i

In [57]:
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [58]:
best_rf_model = grid_search.best_estimator_

# Make predictions
y_pred = best_rf_model.predict(X_test)

# Evaluate the tuned model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8888888888888888

Classification Report:
               precision    recall  f1-score   support

           1       1.00      0.91      0.95        11
           2       0.94      0.85      0.89        20
           3       0.62      1.00      0.77         5

    accuracy                           0.89        36
   macro avg       0.86      0.92      0.87        36
weighted avg       0.92      0.89      0.89        36



In [21]:
# Display feature importances
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

print(feature_importances)


              Feature  Importance
3    num_total_crimes    0.221530
0          num_felony    0.210981
2     num_misdemeanor    0.166957
1       num_violation    0.150647
6    num_stopandfrisk    0.107710
8        num_restroom    0.053355
7  num_police_station    0.031298
4       median_income    0.028775
5   unemployment_rate    0.028746


In [11]:
# Load the dataset
file_path = './final_data_with_crime_severity.csv'
data = pd.read_csv(file_path)

# Select relevant features and target
features = ['num_felony', 'num_violation', 'num_misdemeanor', 'num_total_crimes']
X = data[features]
y = data['crime_severity']

# Drop rows with missing values
data = data.dropna(subset=features + ['crime_severity'])

# Ensure the target variable is categorical
y = y.astype('int')  # If not already an integer


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Initialize the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model
rf_model.fit(X_train, y_train)


In [14]:
# Make predictions
y_pred = rf_model.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7222222222222222

Classification Report:
               precision    recall  f1-score   support

           1       0.73      0.73      0.73        11
           2       0.81      0.65      0.72        20
           3       0.56      1.00      0.71         5

    accuracy                           0.72        36
   macro avg       0.70      0.79      0.72        36
weighted avg       0.75      0.72      0.72        36



In [15]:
# Display feature importances
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

print(feature_importances)


            Feature  Importance
3  num_total_crimes    0.362278
0        num_felony    0.285171
2   num_misdemeanor    0.210757
1     num_violation    0.141794
