In [10]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time
import os
import warnings

# Suppress every warning category for the remainder of the kernel session.
warnings.filterwarnings(action='ignore')
print("Libraries imported.")

# Make sure the folder that will receive the processed dataset is present;
# exist_ok=True turns this into a no-op when it already exists.
os.makedirs(name='../data/processed', exist_ok=True)

Libraries imported.


In [11]:
RAW_DATA_PATH = '../data/raw/crime_data.csv'

# Fail fast when the raw file is missing. Merely printing a message would let
# "Run All" continue, and the following cells would then either crash with a
# NameError on `df` or silently work on a stale `df` left in the kernel.
if not os.path.exists(RAW_DATA_PATH):
    raise FileNotFoundError(
        f"Error: File not found at {RAW_DATA_PATH}. "
        "Please make sure 'crime_data.csv' is in the 'data/raw/' folder."
    )

df = pd.read_csv(RAW_DATA_PATH)
print(f"Successfully loaded data. Original shape: {df.shape}")

# --- CLEAN COLUMN NAMES ---
# Strip surrounding whitespace, lowercase, and replace inner spaces with
# underscores so that e.g. 'Victim Age' becomes 'victim_age'.
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
print("Column names standardized.")

# Kept as the last top-level expression so Jupyter actually renders the preview
# (an expression nested inside an if/else suite is never displayed).
df.head()

Successfully loaded data. Original shape: (40160, 14)
Column names standardized.


In [12]:
print("Handling missing values...")

# Text (object-dtype) columns: replace missing entries with the literal
# placeholder 'Unknown'.
cat_cols = df.select_dtypes(include=['object']).columns
for name in cat_cols:
    df[name] = df[name].fillna('Unknown')

# Numeric columns (standardized names of 'Victim Age' / 'Police Deployed'):
# replace missing entries with the column median. Columns absent from the
# frame are skipped.
num_cols = ['victim_age', 'police_deployed']
for name in num_cols:
    if name not in df.columns:
        continue
    df[name] = df[name].fillna(df[name].median())

print("Missing values handled.")
df.info()

Handling missing values...
Missing values handled.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40160 entries, 0 to 40159
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   report_number       40160 non-null  int64 
 1   date_reported       40160 non-null  object
 2   date_of_occurrence  40160 non-null  object
 3   time_of_occurrence  40160 non-null  object
 4   city                40160 non-null  object
 5   crime_code          40160 non-null  int64 
 6   crime_description   40160 non-null  object
 7   victim_age          40160 non-null  int64 
 8   victim_gender       40160 non-null  object
 9   weapon_used         40160 non-null  object
 10  crime_domain        40160 non-null  object
 11  police_deployed     40160 non-null  int64 
 12  case_closed         40160 non-null  object
 13  date_case_closed    40160 non-null  object
dtypes: int64(4), object(10)
memory usage: 4.3+ MB


In [13]:
import pandas as pd
import os
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy.exc import GeocoderUnavailable # Import necessary exception

# Re-load the data and clean columns if you haven't run previous cells in this session
# This ensures 'city' column exists and is clean
# df = pd.read_csv('../data/raw/crime_data.csv')
# df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

print("Starting geocoding... This will take a while.")

# Placeholder used for missing city names. The missing-value step writes it as
# 'Unknown', so it is compared case-insensitively below; an exact match against
# 'unknown' would never fire and "Unknown, India" would be sent to the API.
UNKNOWN_CITY = 'unknown'

# Initialize Nominatim API with a longer timeout (10 seconds)
geolocator = Nominatim(user_agent="geo-crime-predictor-v3", timeout=10)

# Throttle to one request per second and wait 10 s between retries so the
# server has time to recover from load. The retry-count keyword of RateLimiter
# is `max_retries` (not `max_tries`); its default is kept here.
# swallow_exceptions=False makes the limiter re-raise once retries are
# exhausted, so the except-branches below can tell a service failure apart
# from a city that simply was not found.
geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=1,
    error_wait_seconds=10,
    swallow_exceptions=False,
)

# city name -> (latitude, longitude); (None, None) marks a failed lookup.
location_cache = {}

# Geocode each distinct city once instead of once per row.
unique_cities = df['city'].unique()
print(f"Found {len(unique_cities)} unique cities to geocode.")

for city in unique_cities:
    # Skip missing / placeholder city names without spending an API call.
    if pd.isna(city) or str(city).strip().lower() == UNKNOWN_CITY:
        location_cache[city] = (None, None)
        continue

    query = f"{city}, India"
    try:
        # The RateLimiter handles the slow pace and retries here
        location = geocode(query)
        if location:
            location_cache[city] = (location.latitude, location.longitude)
        else:
            # The service answered but returned no match for this query.
            print(f"Warning: Could not find location for '{city}'")
            location_cache[city] = (None, None)
    except GeocoderUnavailable as e:
        # Connection-level failure that persisted after all retries.
        print(f"Critical Geocoding Error (Unavailable) for {city}: {e}")
        location_cache[city] = (None, None)
    except Exception as e:
        # Any other error (timeouts, quota, ...): record the failure and keep
        # going so that one bad city does not abort the whole run.
        print(f"Critical Unexpected Error geocoding {city}: {e}")
        location_cache[city] = (None, None)

print("Geocoding API calls complete. Applying to DataFrame...")

# Map the cached locations back to the DataFrame
df['latitude'] = df['city'].map(lambda x: location_cache.get(x, (None, None))[0])
df['longitude'] = df['city'].map(lambda x: location_cache.get(x, (None, None))[1])

# Drop rows where geocoding failed, and report how many were lost so that a
# wholesale failure (e.g. no network) is visible instead of silent.
original_rows = len(df)
df = df.dropna(subset=['latitude', 'longitude']).reset_index(drop=True)
print(f"Dropped {original_rows - len(df)} of {original_rows} rows without coordinates.")

Starting geocoding... This will take a while.
Found 29 unique cities to geocode.
Geocoding API calls complete. Applying to DataFrame...


In [14]:
# --- Final Preprocessing: Feature Engineering and Encoding ---
print("\n--- Starting Final Feature Engineering and Encoding ---")

# 1. Feature Engineering: Extracting Time Features from 'date_reported'
# 'date_reported' is used as a proxy for the incident's date/time context.
# errors='coerce' turns anything that does not match the format (including the
# 'Unknown' placeholder) into NaT, so the number of such rows is reported.
df['date_reported'] = pd.to_datetime(df['date_reported'], format='%m-%d-%Y %H:%M', errors='coerce')

unparsed_dates = int(df['date_reported'].isna().sum())
if unparsed_dates:
    print(f"Warning: {unparsed_dates} 'date_reported' values could not be parsed; "
          "their time features will be missing.")

df['report_hour'] = df['date_reported'].dt.hour
df['report_day_of_week'] = df['date_reported'].dt.dayofweek
df['report_month'] = df['date_reported'].dt.month

# 2. Categorical Encoding (One-Hot Encoding)
# Convert the main categorical features into indicator columns. Only columns
# still present are encoded: get_dummies consumes its inputs, so without this
# guard re-running the cell would raise a KeyError.
categorical_cols = ['victim_gender', 'weapon_used', 'crime_domain', 'case_closed']
cols_to_encode = [col for col in categorical_cols if col in df.columns]
skipped_cols = [col for col in categorical_cols if col not in df.columns]
if skipped_cols:
    print(f"Note: columns not found (already encoded or absent), skipped: {skipped_cols}")
if cols_to_encode:
    df = pd.get_dummies(df, columns=cols_to_encode, drop_first=True, prefix=cols_to_encode)

print("Categorical encoding complete. New columns added.")

# 3. Dropping Redundant Columns
# Drop original columns that are now redundant or not useful for the model.
# NOTE(review): 'crime_description' is described as the target variable for the
# next notebook, yet it is removed from the saved file here (as is
# 'crime_code'). Confirm the downstream notebook does not need it.
columns_to_drop = [
    'report_number',        # Unique ID, not a feature
    'date_reported',        # Extracted features from this
    'date_of_occurrence',   # Too complex/redundant with 'date_reported' for simple model
    'time_of_occurrence',   # Not consistent and too noisy
    'city',                 # Replaced by latitude/longitude
    'crime_code',           # Numeric code for 'crime_description'
    'crime_description',    # The target variable (see NOTE above)
    'date_case_closed',     # Not directly predictive of the crime itself
]

# errors='ignore' tolerates columns that are already gone.
df_processed = df.drop(columns=columns_to_drop, errors='ignore')

# Normalise capitalised coordinate column names, if any, to lowercase. This is
# a no-op when the columns are already named 'latitude' / 'longitude'.
df_processed = df_processed.rename(columns={'Latitude': 'latitude', 'Longitude': 'longitude'})

# Display final feature set and shape
print(f"Final processed DataFrame shape: {df_processed.shape}")
print(f"Final features: {df_processed.columns.tolist()}")

# --- Saving Processed Data ---
print("\n--- Saving Processed Data ---")

# Create the data/processed directory if it doesn't exist (single call,
# no separate existence check needed).
output_dir = '../data/processed'
os.makedirs(output_dir, exist_ok=True)

# Define the output file path
output_file = os.path.join(output_dir, 'processed_crime_data.csv')

# Save the processed DataFrame without the index column
df_processed.to_csv(output_file, index=False)

print(f"Successfully saved processed data to: {output_file}")


--- Starting Final Feature Engineering and Encoding ---
Categorical encoding complete. New columns added.
Final processed DataFrame shape: (40160, 19)
Final features: ['victim_age', 'police_deployed', 'latitude', 'longitude', 'report_hour', 'report_day_of_week', 'report_month', 'victim_gender_M', 'victim_gender_X', 'weapon_used_Explosives', 'weapon_used_Firearm', 'weapon_used_Knife', 'weapon_used_Other', 'weapon_used_Poison', 'weapon_used_Unknown', 'crime_domain_Other Crime', 'crime_domain_Traffic Fatality', 'crime_domain_Violent Crime', 'case_closed_Yes']

--- Saving Processed Data ---
Successfully saved processed data to: ../data/processed\processed_crime_data.csv
