<a href="https://colab.research.google.com/github/jansiddiqui/Learning-MachineLearning/blob/main/Crime_Prediction_A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# ------------------------------------------------------------
# STEP 1: LOAD DATA
# ------------------------------------------------------------
df = pd.read_csv("/content/final_cleaned_filtered_female_crime.csv")

# Convert date column to datetime
df["Date Reported"] = pd.to_datetime(df["Date Reported"], errors='coerce')
df["Date of Occurrence"] = pd.to_datetime(df["Date of Occurrence"], errors='coerce')

# ------------------------------------------------------------
# STEP 2: FEATURE ENGINEERING
# ------------------------------------------------------------

# A: Weekend feature
df['is_weekend'] = df['Date of Occurrence'].dt.dayofweek >= 5

# B: Month
df['month'] = df['Date of Occurrence'].dt.month

# C: Week of year
df['week_of_year'] = df['Date of Occurrence'].dt.isocalendar().week

# D: Latitude & Longitude bucketization (FIXED)
df['lat_bucket'] = pd.qcut(df['Latitude'], q=10, labels=False, duplicates='drop')
df['lon_bucket'] = pd.qcut(df['Longitude'], q=10, labels=False, duplicates='drop')

# E: Crime keywords ‚Üí binary features
df['is_rape'] = df['Crime Description'].str.contains("rape", case=False, na=False).astype(int)
df['is_assault'] = df['Crime Description'].str.contains("assault", case=False, na=False).astype(int)
df['is_harass'] = df['Crime Description'].str.contains("harass", case=False, na=False).astype(int)

# ------------------------------------------------------------
# STEP 3: DROP USELESS / LEAKAGE COLUMNS
# ------------------------------------------------------------
df = df.drop([
    "Report Number",
    "Date Reported",
    "Date of Occurrence",
    "Time of Occurrence",
    "Date Case Closed",
    "severity_score",     # leakage
    "is_night"            # leakage
], axis=1)

# ------------------------------------------------------------
# STEP 4: ENCODE TARGET LABEL
# ------------------------------------------------------------
encoder = LabelEncoder()
df["safety_label"] = encoder.fit_transform(df["safety_label"])

# ------------------------------------------------------------
# STEP 5: FEATURES AND TARGET
# ------------------------------------------------------------
X = df.drop("safety_label", axis=1)
y = df["safety_label"]

# ------------------------------------------------------------
# STEP 6: NUMERIC & CATEGORICAL FEATURES
# ------------------------------------------------------------
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = ['City', 'Crime Domain', 'Weapon Used', 'Victim Gender']

# ------------------------------------------------------------
# STEP 7: PIPELINES FOR PREPROCESSING
# ------------------------------------------------------------
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# ------------------------------------------------------------
# STEP 8: FINAL PIPELINE (LOGISTIC REGRESSION)
# ------------------------------------------------------------
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('log_reg', LogisticRegression(
        max_iter=10000,
        class_weight='balanced',
        solver='liblinear'
    ))
])

# ------------------------------------------------------------
# STEP 9: TRAIN / TEST SPLIT
# ------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------------------------------------------------
# STEP 10: HYPERPARAMETER TUNING (Grid Search)
# ------------------------------------------------------------
param_grid = {
    'log_reg__C': [0.1, 1, 10, 100]
}

grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)

# ------------------------------------------------------------
# STEP 11: FINAL MODEL EVALUATION
# ------------------------------------------------------------
best_model = grid.best_estimator_

y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

print("\nImproved Logistic Regression Results")
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Testing Accuracy:", accuracy_score(y_test, y_pred_test))

print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))


Best Parameters: {'log_reg__C': 0.1}

Improved Logistic Regression Results
Training Accuracy: 0.9493254543427361
Testing Accuracy: 0.9460423634336678

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      2283
           1       0.96      0.93      0.94      2202

    accuracy                           0.95      4485
   macro avg       0.95      0.95      0.95      4485
weighted avg       0.95      0.95      0.95      4485



In [14]:
print("\nüìå DATA USAGE SUMMARY\n")

print("Original dataset size:", len(df))
print("Training size:", len(X_train))
print("Testing size:", len(X_test))

print("\nTraining Percentage:", (len(X_train)/len(df))*100, "%")
print("Testing Percentage:", (len(X_test)/len(df))*100, "%")

print("\nTraining class distribution:")
print(y_train.value_counts(normalize=True) * 100)

print("\nTesting class distribution:")
print(y_test.value_counts(normalize=True) * 100)



üìå DATA USAGE SUMMARY

Original dataset size: 22423
Training size: 17938
Testing size: 4485

Training Percentage: 79.99821611737947 %
Testing Percentage: 20.001783882620526 %

Training class distribution:
safety_label
0    51.075928
1    48.924072
Name: proportion, dtype: float64

Testing class distribution:
safety_label
0    50.90301
1    49.09699
Name: proportion, dtype: float64


In [16]:
# ---------------------------
# PREDICT FOR SINGLE INSTANCE
# ---------------------------
input_data = {
    # Original categorical features
    "City": "Delhi",
    "Crime Domain": "Public Place",
    "Weapon Used": "Knife",
    "Victim Gender": "Female",

    # Original numeric features
    "Victim Age": 23,
    "Crime Code": 102,
    "Police Deployed": 2,
    "Latitude": 28.7041,
    "Longitude": 77.1025,
    "hour_of_day": 22,   # 10 PM

    # Feature-engineered columns
    "is_weekend": 0,
    "month": 5,
    "week_of_year": 18,
    "lat_bucket": 5,
    "lon_bucket": 4,

    # Crime description keywords
    "is_rape": 0,
    "is_assault": 1,
    "is_harass": 1
}


# Convert to DataFrame
input_df = pd.DataFrame([input_data])

# Make prediction
prediction = best_model.predict(input_df)
prediction_proba = best_model.predict_proba(input_df)

# Show result
print("Prediction output:", prediction)

if prediction[0] == 0:
    print("Prediction: ‚ùå UNSAFE AREA")
else:
    print("Prediction: ‚úÖ SAFE AREA")

print("Probability (Unsafe/Safe):", prediction_proba)


Prediction output: [1]
Prediction: ‚úÖ SAFE AREA
Probability (Unsafe/Safe): [[8.27447093e-04 9.99172553e-01]]


