# Model training: random forest classifier



In [None]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset.
# Point this path at your own copy of the CSV file.
pakistan = pd.read_csv("/content/cleaned_dataset.csv")

# Discard the columns this script treats as irrelevant for modelling.
drop_columns = [
    'ID', 'Case Number', 'FBI Code', 'Block', 'Location', 'Updated On',
    'X Coordinate', 'Y Coordinate', 'Beat', 'Ward', 'Community Area',
]
pakistan.drop(columns=drop_columns, inplace=True)

# Break the 'Date' timestamp into separate calendar/clock components.
# Skipped entirely when the file carries no 'Date' column.
if 'Date' in pakistan.columns:
    # errors='coerce' turns unparseable timestamps into NaT instead of raising,
    # so the derived components of those rows come out as missing values.
    pakistan['Date'] = pd.to_datetime(pakistan['Date'], errors='coerce')
    for target_column, component in (
        ('Year', 'year'),
        ('Month', 'month'),
        ('Day', 'day'),
        ('Hour', 'hour'),
        ('Minute', 'minute'),
    ):
        pakistan[target_column] = getattr(pakistan['Date'].dt, component)
    # The raw timestamp is no longer needed once its parts are extracted.
    pakistan.drop(columns=['Date'], inplace=True)

# Report, then repair, missing values.
print("Missing values before handling:")
print(pakistan.isna().sum())

# Numeric gaps (float64 / int64 columns) are replaced by that column's mean.
numeric_columns = pakistan.select_dtypes(include=['int64', 'float64']).columns
column_means = pakistan[numeric_columns].mean()
pakistan[numeric_columns] = pakistan[numeric_columns].fillna(column_means)

# Gaps in object (string) columns are replaced by an explicit placeholder.
categorical_columns = pakistan.select_dtypes(include='object').columns
filled_text = pakistan[categorical_columns].fillna('Unknown')
pakistan[categorical_columns] = filled_text

# Whatever is still missing (e.g. columns of other dtypes, or a numeric
# column whose mean is itself NaN) is removed row-wise.
pakistan.dropna(inplace=True)

print("Missing values after handling:")
print(pakistan.isna().sum())

# Encode Categorical Variables
# Location descriptions become integer codes, one per distinct value.
pakistan['Location Description'] = pd.factorize(pakistan['Location Description'])[0]

# Boolean flags -> 0/1.
# A plain .map({True: 1, False: 0}) silently yields NaN for anything that is
# not a real bool (the strings 'True'/'False', values already stored as 0/1,
# or the 'Unknown' placeholder used for missing text). Because this step runs
# after the missing-value cleanup, those NaNs would leak into the features.
# Normalising through lower-cased strings accepts every common spelling.
flag_codes = {'true': 1, 'false': 0, '1': 1, '0': 0, '1.0': 1, '0.0': 0}
flag_columns = ['Domestic', 'Arrest']
for flag in flag_columns:
    normalised = pakistan[flag].astype(str).str.strip().str.lower()
    pakistan[flag] = normalised.map(flag_codes)

# Rows whose flag could not be interpreted are dropped explicitly (and
# reported) rather than being passed to the model as NaN.
unmapped_count = int(pakistan[flag_columns].isna().any(axis=1).sum())
if unmapped_count:
    print("Rows dropped for unrecognised Domestic/Arrest values:", unmapped_count)
    pakistan.dropna(subset=flag_columns, inplace=True)

# With no NaN left the flags can be stored as real integers.
for flag in flag_columns:
    pakistan[flag] = pakistan[flag].astype('int64')

# Check Class Distribution
# Counted once and reused below for the rare-class filter.
y_counts = pakistan['Primary Type'].value_counts()
print("Class distribution before handling:")
print(y_counts)

# Handle Rare Classes
# A stratified train/test split needs at least two samples of every class,
# so classes with fewer samples than that are removed.
min_samples_per_class = 2
rare_classes = y_counts[y_counts < min_samples_per_class].index
pakistan = pakistan[~pakistan['Primary Type'].isin(rare_classes)]

# NOTE: merging rare classes into a single 'Other' label is the alternative
# strategy. It used to run *after* the removal above, where it could never
# change a value (every remaining class already meets the threshold) but
# still cost one Python call per row and assigned into a filtered frame.
# Use one strategy or the other, not both.

# Assemble the feature matrix and the label vector.
feature_names = [
    'Latitude', 'Longitude',
    'Year', 'Month', 'Day', 'Hour', 'Minute',
    'Location Description', 'Domestic', 'Arrest',
]
X = pakistan[feature_names]
y = pakistan['Primary Type']

# Hold out 20% of the rows for testing; stratify=y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit a 100-tree random forest with a fixed seed (fit returns the estimator).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the held-out labels.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

# Rank the input columns by the importance the fitted forest assigns them.
importances = model.feature_importances_
features = X.columns
importance_table = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importance_df = importance_table.sort_values(by='Importance', ascending=False)

print("\nTop Features Contributing to Predictions:\n")
print(feature_importance_df.head(10))

  chicago = pd.read_csv("/content/drive/MyDrive/final_expanded_dataset.csv")


Missing values before handling:
Unnamed: 0                    0
Primary Type                  0
Description                   0
Location Description          0
Arrest                        0
Domestic                      0
District                      0
Year                    6677442
Latitude                      0
Longitude                     0
Month                   6677442
Day                     6677442
Hour                    6677442
Minute                  6677442
dtype: int64
Missing values after handling:
Unnamed: 0              0
Primary Type            0
Description             0
Location Description    0
Arrest                  0
Domestic                0
District                0
Year                    0
Latitude                0
Longitude               0
Month                   0
Day                     0
Hour                    0
Minute                  0
dtype: int64
Class distribution before handling:
Primary Type
THEFT        2108803
NARCOTICS    2103015
BATTERY 